#include <fmt/format.h>
#include <sstream> // for std::stringstream used below; other includes elided

namespace armnn
{

using namespace armnn::profiling;

namespace
{

template <typename ExceptionType>
std::string ToErrorMessage(const char* prefix, const ExceptionType& error)
{
    std::stringstream ss;
    ss << prefix << " " << error.what();
    return ss.str();
}
void AddLayerStructure(std::unique_ptr<TimelineUtilityMethods>& timelineUtils,
                       const Layer& layer,
                       ProfilingGuid networkGuid)
{
    // Add the layer to the post-optimisation network structure.
    std::string layerName = layer.GetNameStr().empty() ? "<Unnamed>" : layer.GetNameStr();
    timelineUtils->CreateNamedTypedChildEntity(layer.GetGuid(),
                                               networkGuid,
                                               layerName,
                                               LabelsAndEventClasses::LAYER_GUID);
    for (auto&& input : layer.GetInputSlots())
    {
        const IOutputSlot* source = input.GetConnectedOutputSlot();
        ARMNN_ASSERT(source != nullptr);
        timelineUtils->CreateConnectionRelationship(ProfilingRelationshipType::RetentionLink,
                                                    source->GetOwningLayerGuid(),
                                                    layer.GetGuid());
    }
}
void AddWorkloadStructure(std::unique_ptr<TimelineUtilityMethods>& timelineUtils,
                          std::unique_ptr<IWorkload>& workload,
                          const Layer& layer)
{
    // Add the workload to the post-optimisation network structure.
    timelineUtils->CreateTypedEntity(workload->GetGuid(), LabelsAndEventClasses::WORKLOAD_GUID);
    timelineUtils->MarkEntityWithLabel(workload->GetGuid(),
                                       layer.GetBackendId().Get(),
                                       LabelsAndEventClasses::BACKENDID_GUID);

    // Link the workload to the layer.
    timelineUtils->CreateRelationship(ProfilingRelationshipType::RetentionLink,
                                      layer.GetGuid(),
                                      workload->GetGuid(),
                                      LabelsAndEventClasses::CHILD_GUID);
}

} // anonymous namespace
std::unique_ptr<LoadedNetwork> LoadedNetwork::MakeLoadedNetwork(std::unique_ptr<IOptimizedNetwork> net,
                                                                std::string& errorMessage,
                                                                const INetworkProperties& networkProperties,
                                                                profiling::ProfilingService& profilingService,
                                                                const NetworkId networkIdOut)
{
    std::unique_ptr<LoadedNetwork> loadedNetwork;

    auto Fail = [&](const std::exception& error) -> std::unique_ptr<LoadedNetwork>
    {
        errorMessage = ToErrorMessage("An error occurred when preparing the network workloads: ", error);
        ARMNN_LOG(error) << errorMessage;

        return std::unique_ptr<LoadedNetwork>();
    };

    try
    {
        loadedNetwork.reset(new LoadedNetwork(std::move(net), networkProperties, profilingService, networkIdOut));
    }
    // ... (armnn exception types are handled the same way)
    catch (const std::runtime_error& error)
    {
        return Fail(error);
    }

    return loadedNetwork;
}
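// Editor's note: a minimal caller sketch for MakeLoadedNetwork(), assuming an already
// optimised network 'optNet' and runtime-provided 'networkProperties', 'profilingService'
// and 'networkId' (these names are illustrative, not from this file):
//
//     std::string errorMessage;
//     std::unique_ptr<LoadedNetwork> loaded = LoadedNetwork::MakeLoadedNetwork(
//         std::move(optNet), errorMessage, networkProperties, profilingService, networkId);
//     if (!loaded)
//     {
//         ARMNN_LOG(error) << errorMessage; // construction errors are captured by Fail above
//     }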
LoadedNetwork::LoadedNetwork(std::unique_ptr<IOptimizedNetwork> net,
                             const INetworkProperties& networkProperties,
                             profiling::ProfilingService& profilingService,
                             const NetworkId networkId) :
                             m_OptimizedNetwork(std::move(net)),
                             m_NetworkProperties(networkProperties),
                             m_NetworkId(networkId),
                             m_TensorHandleFactoryRegistry(),
                             m_ProfilingService(profilingService)
{
    // Create a profiler and register it for the current thread.
    m_Profiler = std::make_shared<IProfiler>();
    ProfilerManager::GetInstance().RegisterProfiler(m_Profiler.get());

    Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort();

    // First create tensor handlers, backends and workload factories.
    // Handlers are created before workloads because workload creation can modify some
    // of the handlers (for example the splitter and concat layers).
    for (auto&& layer : order)
    {
        auto const& backendId = layer->GetBackendId();
        if (m_Backends.count(backendId) == 0)
        {
            auto createBackend = BackendRegistryInstance().GetFactory(backendId);
            auto it = m_Backends.emplace(std::make_pair(backendId, createBackend()));

            IBackendInternal* backend = it.first->second.get();

            if (backend->SupportsTensorAllocatorAPI())
            {
                auto workloadFactory = backend->CreateWorkloadFactory(
                    m_TensorHandleFactoryRegistry, m_OptimizedNetwork->pOptimizedNetworkImpl->GetModelOptions(),
                    static_cast<MemorySourceFlags>(m_NetworkProperties.m_InputSource),
                    static_cast<MemorySourceFlags>(m_NetworkProperties.m_OutputSource));

                m_WorkloadFactories.emplace(
                    std::make_pair(backendId, std::make_pair(std::move(workloadFactory), nullptr)));
            }
            else
            {
                IBackendInternal::IMemoryManagerSharedPtr memoryManager = backend->CreateMemoryManager();
                auto workloadFactory = backend->CreateWorkloadFactory(
                    memoryManager, m_OptimizedNetwork->pOptimizedNetworkImpl->GetModelOptions());

                m_WorkloadFactories.emplace(
                    std::make_pair(backendId, std::make_pair(std::move(workloadFactory), memoryManager)));
            }
        }
    }

    // Then create the thread pool if requested.
    if (m_NetworkProperties.m_NumThreads > 0 && networkProperties.m_AsyncEnabled)
    {
        CreateThreadPool(m_NetworkProperties.m_NumThreads);
    }
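    // Editor's note: after the loop above, each distinct BackendId in the graph owns
    // exactly one IBackendInternal instance plus one workload factory. Backends
    // implementing the tensor allocator API register handle factories with
    // m_TensorHandleFactoryRegistry and store nullptr as the paired memory manager;
    // legacy backends instead keep the shared IMemoryManager alongside the factory so
    // it can be Acquire()d and Release()d around execution.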
    // Now create the tensor handles for the intermediate outputs of each layer.
    for (auto&& layer : order)
    {
        auto& workloadFactory = GetWorkloadFactory(*layer);

        switch (layer->GetType())
        {
        case LayerType::Input:
        case LayerType::MemImport:
            {
                // If import is enabled, the handles must not be memory managed.
                layer->CreateTensorHandles(m_TensorHandleFactoryRegistry,
                                           workloadFactory,
                                           !m_NetworkProperties.m_ImportEnabled);
                break;
            }
        default:
            {
                // Look for a layer with 1 OutputSlot which has 1 connection, where that
                // connection is to an Output layer. If export is enabled, disable memory
                // management so we can export; otherwise we do a copy.
                if ((layer->GetNumOutputSlots() == 1) &&
                    (layer->GetOutputSlots()[0].GetNumConnections() == 1) &&
                    (layer->GetOutputSlots()[0].GetConnection(0)->GetOwningLayer().GetType() == LayerType::Output))
                {
                    layer->CreateTensorHandles(m_TensorHandleFactoryRegistry,
                                               workloadFactory,
                                               !m_NetworkProperties.m_ExportEnabled);
                }
                else
                {
                    layer->CreateTensorHandles(m_TensorHandleFactoryRegistry, workloadFactory);
                }
            }
        }
    }
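    // Editor's note: the !m_ImportEnabled / !m_ExportEnabled arguments above toggle the
    // "isMemoryManaged" flag on the created tensor handles. When user memory is to be
    // imported at the inputs or exported at the outputs, those handles must stay outside
    // the backend's internal memory manager, otherwise their storage could be pooled or
    // relocated and the zero-copy mapping would be invalid.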
    ProfilingGuid networkGuid = m_OptimizedNetwork->GetGuid();
    std::unique_ptr<TimelineUtilityMethods> timelineUtils =
        TimelineUtilityMethods::GetTimelineUtils(m_ProfilingService);
    if (timelineUtils)
    {
        // Create the network entity, record a start-of-life event and label it with the
        // current process ID (formatted through a std::stringstream).
        // ...
    }

    // Then create the workloads.
    for (auto&& layer : order)
    {
        if (timelineUtils)
        {
            // Add the layer to the post-optimisation network structure.
            AddLayerStructure(timelineUtils, *layer, networkGuid);
        }

        const IWorkloadFactory& workloadFactory = GetWorkloadFactory(*layer);

        switch (layer->GetType())
        {
        case LayerType::Input:
        case LayerType::Output:
            {
                // Inputs and outputs are treated in a special way - see EnqueueInput()/EnqueueOutput().
                break;
            }
        default:
            {
                auto workload = layer->CreateWorkload(workloadFactory);

                if (!workload)
                {
                    const char* const layerName =
                        layer->GetNameStr().length() != 0 ? layer->GetName() : "<Unnamed>";
                    throw InvalidArgumentException(
                        fmt::format("No workload created for layer (name: '{0}' type: '{1}') (compute '{2}')",
                                    layerName, static_cast<int>(layer->GetType()), layer->GetBackendId().Get()
                        ));
                }

                if (timelineUtils)
                {
                    // Add the workload to the post-optimisation network structure.
                    AddWorkloadStructure(timelineUtils, workload, *layer);
                }

                // For async networks, constant workloads are managed exclusively by the
                // LoadedNetwork and are separated out from the other workloads.
                if (networkProperties.m_AsyncEnabled && layer->GetType() == LayerType::Constant)
                {
                    m_ConstantWorkloads[layer->GetGuid()] = std::move(workload);
                }
                else
                {
                    m_WorkloadQueue.push_back(std::move(workload));
                }

                // Release the constant data in the layer.
                layer->ReleaseConstantData();
                break;
            }
        }
    }

    for (auto&& workloadFactory : m_WorkloadFactories)
    {
        workloadFactory.second.first->AfterWorkloadsCreated();
    }

    if (timelineUtils)
    {
        // Commit to send the post-optimisation network structure.
        timelineUtils->Commit();
    }

    if (!networkProperties.m_AsyncEnabled)
    {
        // Set up memory for the intermediate tensors.
        m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().AllocateDynamicBuffers();

        // Now that the intermediate tensor memory has been set up, do the
        // post-allocation configuration for each workload.
        for (auto& workload : m_WorkloadQueue)
        {
            workload->PostAllocationConfigure();
        }
    }
    else
    {
        AllocateAndExecuteConstantWorkloads();
    }
}
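// Editor's note: the constructor's tail diverges on m_AsyncEnabled. The synchronous path
// sizes intermediate buffers up front (AllocateDynamicBuffers) and lets each workload
// finalise itself via PostAllocationConfigure(); the asynchronous path instead
// pre-executes the constant workloads once (below), since per-inference working memory
// is created later through CreateWorkingMemHandle().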
void LoadedNetwork::AllocateAndExecuteConstantWorkloads()
{
    Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph();
    for (auto&& layer : order)
    {
        if (layer->GetType() == LayerType::Constant)
        {
            const auto& outSlot = layer->GetOutputSlots()[0];
            const auto factoryId = outSlot.GetTensorHandleFactoryId();
            ARMNN_ASSERT(factoryId != ITensorHandleFactory::LegacyFactoryId);
            auto& workloadFactory = GetWorkloadFactory(*layer);

            layer->CreateTensorHandles(m_TensorHandleFactoryRegistry, workloadFactory);
            ITensorHandle* tensorHandle = outSlot.GetOutputHandler().GetData();

            m_ConstantTensorHandles[layer->GetGuid()] = tensorHandle;
            tensorHandle->Allocate();

            WorkingMemDescriptor memDesc;
            memDesc.m_Outputs.push_back(tensorHandle);
            m_ConstantWorkloads[layer->GetGuid()]->ExecuteAsync(memDesc);
        }
    }
}
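// Editor's note: constants are evaluated once here rather than per inference; the
// handles stored in m_ConstantTensorHandles are wired directly into every
// WorkingMemDescriptor built by CreateWorkingMemHandle(), so concurrent executions share
// a single read-only copy of each constant tensor.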
void LoadedNetwork::SendNetworkStructure()
{
    Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort();
    ProfilingGuid networkGuid = m_OptimizedNetwork->GetGuid();

    std::unique_ptr<TimelineUtilityMethods> timelineUtils =
        TimelineUtilityMethods::GetTimelineUtils(m_ProfilingService);

    timelineUtils->CreateTypedEntity(networkGuid, LabelsAndEventClasses::NETWORK_GUID);

    for (auto&& layer : order)
    {
        // Add the layer to the post-optimisation network structure.
        AddLayerStructure(timelineUtils, *layer, networkGuid);
        switch (layer->GetType())
        {
        case LayerType::Input:
        case LayerType::Output:
            {
                // Inputs and outputs are treated in a special way - see EnqueueInput()/EnqueueOutput().
                break;
            }
        default:
            {
                for (auto& workload : m_WorkloadQueue)
                {
                    // Add the workload to the post-optimisation network structure.
                    AddWorkloadStructure(timelineUtils, workload, *layer);
                }
                break;
            }
        }
    }
    // Commit to send the post-optimisation network structure.
    timelineUtils->Commit();
}

profiling::ProfilingGuid LoadedNetwork::GetNetworkGuid()
{
    return m_OptimizedNetwork->GetGuid();
}
TensorInfo LoadedNetwork::GetInputTensorInfo(LayerBindingId layerId) const
{
    for (auto&& inputLayer : m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().GetInputLayers())
    {
        ARMNN_ASSERT_MSG(inputLayer->GetNumOutputSlots() == 1, "Input layer should have exactly 1 output slot");
        if (inputLayer->GetBindingId() == layerId)
        {
            return inputLayer->GetOutputSlot(0).GetTensorInfo();
        }
    }

    throw InvalidArgumentException(fmt::format("No input layer is associated with id {}", layerId));
}
TensorInfo LoadedNetwork::GetOutputTensorInfo(LayerBindingId layerId) const
{
    for (auto&& outputLayer : m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().GetOutputLayers())
    {
        ARMNN_ASSERT_MSG(outputLayer->GetNumInputSlots() == 1, "Output layer should have exactly 1 input slot");
        ARMNN_ASSERT_MSG(outputLayer->GetInputSlot(0).GetConnection(), "Input slot on Output layer must be connected");
        if (outputLayer->GetBindingId() == layerId)
        {
            return outputLayer->GetInputSlot(0).GetConnection()->GetTensorInfo();
        }
    }

    throw InvalidArgumentException(fmt::format("No output layer is associated with id {}", layerId));
}
const IWorkloadFactory& LoadedNetwork::GetWorkloadFactory(const Layer& layer) const
{
    const IWorkloadFactory* workloadFactory = nullptr;

    auto it = m_WorkloadFactories.find(layer.GetBackendId());
    if (it == m_WorkloadFactories.end())
    {
        throw RuntimeException(fmt::format("No workload factory for {0} to be used for layer: {1}",
                                           layer.GetBackendId().Get(), layer.GetNameStr()),
                               CHECK_LOCATION());
    }

    workloadFactory = it->second.first.get();

    ARMNN_ASSERT_MSG(workloadFactory, "No workload factory");

    std::string reasonIfUnsupported;
    ARMNN_ASSERT_MSG(IWorkloadFactory::IsLayerSupported(layer,
                                                        {},
                                                        reasonIfUnsupported,
                                                        m_OptimizedNetwork->pOptimizedNetworkImpl->GetModelOptions()),
                     "Factory does not support layer");
    IgnoreUnused(reasonIfUnsupported);
    return *workloadFactory;
}
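// Editor's note: ARMNN_ASSERT_MSG is armnn's assert macro, so the two checks above are
// debug-build guards; presumably in release builds an unsupported layer would instead
// surface later as a workload creation failure.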
namespace
{

// A pin owning the ITensorHandle wrapped around a user-supplied input or output buffer.
class TensorPin
{
public:
    TensorPin(std::unique_ptr<ITensorHandle> handle, const TensorInfo& info, LayerBindingId id)
        : m_TensorHandle(std::move(handle))
        , m_TensorInfo(info)
        , m_Id(id)
    {
    }

    ITensorHandle* GetTensorHandle() const { return m_TensorHandle.get(); }
    const TensorInfo& GetTensorInfo() const { return m_TensorInfo; }
    LayerBindingId GetBindingId() const { return m_Id; }

private:
    std::unique_ptr<ITensorHandle> m_TensorHandle;
    TensorInfo m_TensorInfo;
    LayerBindingId m_Id;
};
static const TensorPin& GetTensorPin(LayerBindingId id,
                                     const std::vector<TensorPin>& pins,
                                     char const* bindingPointDesc)
{
    auto it = std::find_if(pins.begin(), pins.end(),
                           [id](const TensorPin& pin)
                           {
                               return pin.GetBindingId() == id;
                           });

    if (it != pins.end())
    {
        return *it;
    }
    else
    {
        throw InvalidArgumentException(fmt::format("No tensor supplied for {0} {1}", bindingPointDesc, id));
    }
}
// Stores data that needs to be kept accessible for the entire execution of a workload.
class WorkloadData
{
public:
    WorkloadData(const InputTensors& inputTensors, const OutputTensors& outputTensors)
    {
        m_InputTensorPins.reserve(inputTensors.size());
        m_OutputTensorPins.reserve(outputTensors.size());

        for (auto inputTensorPair : inputTensors)
        {
            auto inputTensor = inputTensorPair.second;

            std::unique_ptr<ITensorHandle> tensorHandle =
                std::make_unique<ConstPassthroughTensorHandle>(inputTensor.GetInfo(), inputTensor.GetMemoryArea());
            LayerBindingId layerId = inputTensorPair.first;

            m_InputTensorPins.emplace_back(std::move(tensorHandle), inputTensor.GetInfo(), layerId);
        }

        for (auto outputTensorPair : outputTensors)
        {
            auto outputTensor = outputTensorPair.second;

            std::unique_ptr<ITensorHandle> tensorHandle =
                std::make_unique<PassthroughTensorHandle>(outputTensor.GetInfo(), outputTensor.GetMemoryArea());
            LayerBindingId layerId = outputTensorPair.first;

            m_OutputTensorPins.emplace_back(std::move(tensorHandle), outputTensor.GetInfo(), layerId);
        }
    }

    const TensorPin& GetInputTensorPin(LayerBindingId id) const
    {
        return GetTensorPin(id, m_InputTensorPins, "input");
    }

    const TensorPin& GetOutputTensorPin(LayerBindingId id) const
    {
        return GetTensorPin(id, m_OutputTensorPins, "output");
    }

private:
    std::vector<TensorPin> m_InputTensorPins;
    std::vector<TensorPin> m_OutputTensorPins;
};

} // anonymous namespace
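// Editor's note: a minimal sketch of the caller-side containers WorkloadData consumes.
// InputTensors/OutputTensors are vectors of (LayerBindingId, tensor) pairs; the binding
// id 0 and the 'inputInfo'/'outputInfo' TensorInfo objects below are assumed placeholders:
//
//     std::vector<float> inData(inputInfo.GetNumElements());
//     std::vector<float> outData(outputInfo.GetNumElements());
//     InputTensors  inputs  = { { 0, ConstTensor(inputInfo, inData.data()) } };
//     OutputTensors outputs = { { 0, Tensor(outputInfo, outData.data()) } };
//
//     WorkloadData workloadData(inputs, outputs);
//     const TensorPin& pin = workloadData.GetInputTensorPin(0); // throws for unknown ids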
Status LoadedNetwork::EnqueueWorkload(const InputTensors& inputTensors, const OutputTensors& outputTensors)
{
    const Graph& graph = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph();

    // Walk the graph to determine the order of execution.
    if (graph.GetNumLayers() < 2)
    {
        ARMNN_LOG(warning) << "IRuntime::EnqueueWorkload()::Less than two nodes in graph";
        return Status::Failure;
    }

    // Data that must be kept alive while executing the network.
    WorkloadData workloadData(inputTensors, outputTensors);

    // For each input to the network, call EnqueueInput with the data passed by the user.
    {
        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "PrepareInputs");
        m_InputQueue.clear();
        for (const BindableLayer* inputLayer : graph.GetInputLayers())
        {
            const TensorPin& pin = workloadData.GetInputTensorPin(inputLayer->GetBindingId());
            EnqueueInput(*inputLayer, pin.GetTensorHandle(), pin.GetTensorInfo());
        }
    }

    // For each output to the network, call EnqueueOutput with the data passed by the user.
    {
        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "PrepareOutputs");
        m_OutputQueue.clear();
        for (const BindableLayer* outputLayer : graph.GetOutputLayers())
        {
            const TensorPin& pin = workloadData.GetOutputTensorPin(outputLayer->GetBindingId());
            EnqueueOutput(*outputLayer, pin.GetTensorHandle(), pin.GetTensorInfo());
        }
    }

    std::unique_ptr<TimelineUtilityMethods> timelineUtils =
        TimelineUtilityMethods::GetTimelineUtils(m_ProfilingService);
    ProfilingGuid inferenceGuid = m_ProfilingService.GetNextGuid();
    if (timelineUtils)
    {
        // Add an inference timeline trace if profiling is enabled.
        timelineUtils->CreateRelationship(ProfilingRelationshipType::RetentionLink,
                                          m_OptimizedNetwork->GetGuid(), inferenceGuid,
                                          LabelsAndEventClasses::EXECUTION_OF_GUID);
        // ...
    }

    bool executionSucceeded = true;

    {
        if (m_ProfilingService.IsProfilingEnabled())
        {
            m_ProfilingService.IncrementCounterValue(armnn::profiling::INFERENCES_RUN);
        }
        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "Execute");
        executionSucceeded = Execute(timelineUtils, inferenceGuid);
    }

    if (timelineUtils)
    {
        // Commit to send the inference timeline events.
        timelineUtils->Commit();
    }

    return executionSucceeded ? Status::Success : Status::Failure;
}
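// Editor's note: a synchronous inference is therefore three queues executed in order by
// Execute() below (m_InputQueue, m_WorkloadQueue, m_OutputQueue). The input and output
// queues are rebuilt on every call, while m_WorkloadQueue is fixed at network load time.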
void LoadedNetwork::EnqueueInput(const BindableLayer& layer, ITensorHandle* tensorHandle, const TensorInfo& tensorInfo)
{
    if (layer.GetType() != LayerType::Input)
    {
        throw InvalidArgumentException("EnqueueInput: given layer not an InputLayer");
    }

    if (tensorHandle == nullptr)
    {
        throw InvalidArgumentException("EnqueueInput: tensorHandle must not be NULL");
    }

    InputQueueDescriptor inputQueueDescriptor;
    WorkloadInfo info;

    inputQueueDescriptor.m_Inputs.push_back(tensorHandle);
    info.m_InputTensorInfos.push_back(tensorInfo);

    ARMNN_ASSERT_MSG(layer.GetNumOutputSlots() == 1, "Can only handle Input Layer with one output");
    const OutputHandler& handler = layer.GetOutputHandler();
    const TensorInfo& outputTensorInfo = handler.GetTensorInfo();
    ITensorHandle* outputTensorHandle = handler.GetData();
    ARMNN_ASSERT_MSG(outputTensorHandle != nullptr, "Data should have been allocated.");
    inputQueueDescriptor.m_Outputs.push_back(outputTensorHandle);
    info.m_OutputTensorInfos.push_back(outputTensorInfo);

    MemorySourceFlags importFlags = outputTensorHandle->GetImportFlags();
    bool needMemCopy = true;
    if (m_NetworkProperties.m_ImportEnabled)  // Try to import the input tensor.
    {
        if (CheckFlag(importFlags, m_NetworkProperties.m_InputSource))
        {
            needMemCopy = false;
            // This assumes a CPU tensor handle.
            void* mem = tensorHandle->Map(false);
            if (outputTensorHandle->Import(mem, m_NetworkProperties.m_InputSource))
            {
                tensorHandle->Unmap();
                return; // No need for a workload since the import has been done.
            }
            tensorHandle->Unmap();
            throw MemoryImportException("EnqueueInput: Memory Import failed");
        }
    }
    if (needMemCopy)
    {
        // Create a mem copy workload for the input since we did not import.
        std::unique_ptr<IWorkload> inputWorkload = std::make_unique<CopyMemGenericWorkload>(inputQueueDescriptor, info);

        ARMNN_ASSERT_MSG(inputWorkload, "No input workload created");

        std::unique_ptr<TimelineUtilityMethods> timelineUtils =
            TimelineUtilityMethods::GetTimelineUtils(m_ProfilingService);
        if (timelineUtils)
        {
            // Add the input workload to the post-optimisation network structure.
            AddWorkloadStructure(timelineUtils, inputWorkload, layer);
            timelineUtils->Commit();
        }

        m_InputQueue.push_back(std::move(inputWorkload));
    }
}
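// Editor's note: EnqueueInput has two outcomes. If import is enabled and the destination
// handle advertises a compatible import source, the user buffer is mapped and adopted
// directly (zero copy) and no workload is queued; otherwise a CopyMemGenericWorkload is
// queued to memcpy the data in. Note that a supported-but-failed import throws rather
// than silently falling back to the copy path.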
void LoadedNetwork::EnqueueOutput(const BindableLayer& layer, ITensorHandle* tensorHandle, const TensorInfo& tensorInfo)
{
    if (layer.GetType() != LayerType::Output)
    {
        throw InvalidArgumentException("EnqueueOutput: given layer not an OutputLayer");
    }

    if (tensorHandle == nullptr)
    {
        throw InvalidArgumentException("EnqueueOutput: tensorHandle must not be NULL");
    }

    OutputQueueDescriptor outputQueueDescriptor;
    WorkloadInfo info;

    outputQueueDescriptor.m_Outputs.push_back(tensorHandle);
    info.m_OutputTensorInfos.push_back(tensorInfo);

    ARMNN_ASSERT_MSG(layer.GetNumInputSlots() == 1, "Output Layer should have exactly one input.");

    // Gets the output handler from the previous node.
    const OutputHandler& outputHandler = layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetOutputHandler();

    const TensorInfo& inputTensorInfo = outputHandler.GetTensorInfo();
    ITensorHandle* inputTensorHandle = outputHandler.GetData();
    ARMNN_ASSERT_MSG(inputTensorHandle != nullptr, "Data should have been allocated.");

    // Try to export the output tensor: only possible if the pointer is sufficiently
    // aligned, the tensor has zero padding, the slot's only connection is to an Output
    // layer, the pointer can be imported by the backend, and export is enabled.
    bool needMemCopy = true;
    if (m_NetworkProperties.m_ExportEnabled &&
        (layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetNumConnections() == 1))
    {
        if (layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetOwningLayer().GetType() != LayerType::Input)
        {
            MemorySourceFlags importFlags = inputTensorHandle->GetImportFlags();
            if (CheckFlag(importFlags, m_NetworkProperties.m_OutputSource))
            {
                needMemCopy = false;
                void* mem = tensorHandle->Map(false);
                bool importOk = inputTensorHandle->Import(mem, m_NetworkProperties.m_OutputSource);
                tensorHandle->Unmap();

                if (importOk)
                {
                    // Insert a synchronisation workload in place of the copy.
                    MemSyncQueueDescriptor syncDesc;
                    syncDesc.m_Inputs.push_back(inputTensorHandle);
                    info.m_InputTensorInfos.push_back(inputTensorInfo);
                    auto syncWorkload = std::make_unique<SyncMemGenericWorkload>(syncDesc, info);
                    ARMNN_ASSERT_MSG(syncWorkload, "No sync workload created");
                    m_OutputQueue.push_back(std::move(syncWorkload));
                }
                else
                {
                    throw MemoryExportException("EnqueueOutput: Memory Export failed");
                }
            }
        }
    }
    if (needMemCopy)
    {
        // If we got here we did not export the memory, so add an output workload that performs a memcopy.
        outputQueueDescriptor.m_Inputs.push_back(inputTensorHandle);
        info.m_InputTensorInfos.push_back(inputTensorInfo);

        std::unique_ptr<IWorkload> outputWorkload =
            std::make_unique<CopyMemGenericWorkload>(outputQueueDescriptor, info);
        ARMNN_ASSERT_MSG(outputWorkload, "No output workload created");

        std::unique_ptr<TimelineUtilityMethods> timelineUtils =
            TimelineUtilityMethods::GetTimelineUtils(m_ProfilingService);
        if (timelineUtils)
        {
            // Add the output workload to the post-optimisation network structure.
            AddWorkloadStructure(timelineUtils, outputWorkload, layer);
            timelineUtils->Commit();
        }

        m_OutputQueue.push_back(std::move(outputWorkload));
    }
}
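// Editor's note: on the successful export path above no copy is queued; a
// SyncMemGenericWorkload is queued instead, so the backend synchronises the exported
// buffer at the position in the output queue where the copy would otherwise have run.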
void LoadedNetwork::AllocateWorkingMemory(std::lock_guard<std::mutex>& lock)
{
    ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "Working Memory Allocation");

    // This unused parameter makes sure the function can only be called while holding a valid lock.
    IgnoreUnused(lock);

    if (m_IsWorkingMemAllocated)
    {
        return;
    }
    for (auto&& workloadFactory : m_WorkloadFactories)
    {
        IBackendInternal::IMemoryManagerSharedPtr memoryManager = workloadFactory.second.second;
        if (memoryManager)
        {
            memoryManager->Acquire();
        }
    }
    m_TensorHandleFactoryRegistry.AquireMemory();
    m_IsWorkingMemAllocated = true;
}

void LoadedNetwork::FreeWorkingMemory()
{
    std::lock_guard<std::mutex> lockGuard(m_WorkingMemMutex);
    if (!m_IsWorkingMemAllocated)
    {
        return;
    }
    // Informs the memory managers to release memory in their respective memory groups.
    for (auto&& workloadFactory : m_WorkloadFactories)
    {
        IBackendInternal::IMemoryManagerSharedPtr memoryManager = workloadFactory.second.second;
        if (memoryManager)
        {
            memoryManager->Release();
        }
    }
    m_TensorHandleFactoryRegistry.ReleaseMemory();
    m_IsWorkingMemAllocated = false;
}
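// Editor's note: Acquire()/AquireMemory() and Release()/ReleaseMemory() bracket the
// lifetime of inter-layer working memory ("AquireMemory" is the registry's actual
// spelling). Allocation is lazy, happens on first execution, and persists across
// inferences thanks to the m_IsWorkingMemAllocated flag; FreeWorkingMemory() must be
// called explicitly to hand the memory back between inferences.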
bool LoadedNetwork::Execute(std::unique_ptr<TimelineUtilityMethods>& timelineUtils,
                            profiling::ProfilingGuid inferenceGuid)
{
    bool success = true;

    auto Fail = [&](const std::exception& error)
    {
        ARMNN_LOG(error) << "An error occurred attempting to execute a workload: " << error.what();
        success = false;
    };

    try
    {
        std::lock_guard<std::mutex> lockGuard(m_WorkingMemMutex);
        AllocateWorkingMemory(lockGuard);

        ProfilingDynamicGuid workloadInferenceID(0);
        auto ExecuteQueue = [&timelineUtils, &workloadInferenceID, &inferenceGuid](WorkloadQueue& queue)
        {
            for (auto& workload : queue)
            {
                if (timelineUtils)
                {
                    workloadInferenceID = timelineUtils->RecordWorkloadInferenceAndStartOfLifeEvent(workload->GetGuid(),
                                                                                                    inferenceGuid);
                }
                workload->Execute();
                if (timelineUtils)
                {
                    timelineUtils->RecordEndOfLifeEvent(workloadInferenceID);
                }
            }
        };

        ExecuteQueue(m_InputQueue);
        ExecuteQueue(m_WorkloadQueue);
        ExecuteQueue(m_OutputQueue);
    }
    // ... (armnn exception types are handled the same way)
    catch (const std::runtime_error& error)
    {
        Fail(error);
    }

    return success;
}
void LoadedNetwork::CreateThreadPool(std::size_t numThreads)
{
    for (auto i = 0u; i < numThreads; ++i)
    {
        std::unique_ptr<IWorkingMemHandle> workingMemHandle = CreateWorkingMemHandle(m_NetworkId);
        m_Threads.emplace_back(
            std::make_unique<std::thread>(
                &LoadedNetwork::ProcessExecPriorities,
                this,
                std::move(workingMemHandle)
            )
        );
    }
}
void LoadedNetwork::TerminateThreadPool() noexcept
{
    {
        std::unique_lock<std::mutex> threadPoolLock(m_ThreadPoolMutex);
        m_TerminatePool = true;
    }

    m_ThreadPoolEvent.notify_all();

    for (auto& thread : m_Threads)
    {
        thread->join();
    }
}
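// Editor's note: shutdown follows the usual condition-variable protocol: set
// m_TerminatePool while holding the mutex, notify_all(), then join every thread. The
// wait predicate in ProcessExecPriorities() also wakes on non-empty queues, so workers
// drain any outstanding work and only exit once the terminate flag is set and all three
// queues are empty.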
void LoadedNetwork::Schedule(const InputTensors& inputTensors,
                             const OutputTensors& outputTensors,
                             const QosExecPriority priority,
                             std::shared_ptr<IAsyncExecutionCallback> cb)
{
    // Group the execution parameters so that they can be easily added to the queue.
    ExecutionTuple groupExecParams = std::make_tuple(inputTensors, outputTensors, cb);
    std::shared_ptr<ExecutionTuple> operation = std::make_shared<ExecutionTuple>(groupExecParams);

    // Add a message to the queue and notify the request thread.
    std::unique_lock<std::mutex> lock(m_ThreadPoolMutex);
    switch (priority)
    {
        case QosExecPriority::High:
            m_HighPriorityQueue.push(operation);
            break;
        case QosExecPriority::Low:
            m_LowPriorityQueue.push(operation);
            break;
        case QosExecPriority::Medium:
        default:
            m_MediumPriorityQueue.push(operation);
    }
    m_ThreadPoolEvent.notify_one();
}
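// Editor's note: a minimal caller sketch for Schedule(); 'loadedNetwork', 'inputs',
// 'outputs' and 'myCallback' are illustrative names, not part of this file:
//
//     std::shared_ptr<IAsyncExecutionCallback> myCallback = /* user-provided callback */;
//     loadedNetwork->Schedule(inputs, outputs, QosExecPriority::Medium, myCallback);
//     // Schedule() returns immediately; a pool thread created in CreateThreadPool()
//     // later pops the tuple, runs Execute(), and reports status and timings through
//     // myCallback->Notify().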
void LoadedNetwork::ProcessExecPriorities(std::unique_ptr<IWorkingMemHandle> workingMemHandle)
{
    int expireRate = EXPIRE_RATE;
    int highPriorityCount = 0;
    int mediumPriorityCount = 0;

    IWorkingMemHandle& workingMemHandleRef = *workingMemHandle.get();

    while (true)
    {
        std::shared_ptr<ExecutionTuple> currentExecInProgress(nullptr);
        {
            // Wait for a message to be added to the queue. This is in a separate scope
            // to minimise the lifetime of the lock.
            std::unique_lock<std::mutex> lock(m_ThreadPoolMutex);

            m_ThreadPoolEvent.wait(lock,
                                   [=]
                                   {
                                       return m_TerminatePool || !m_HighPriorityQueue.empty() ||
                                              !m_MediumPriorityQueue.empty() || !m_LowPriorityQueue.empty();
                                   });

            if (m_TerminatePool && m_HighPriorityQueue.empty() && m_MediumPriorityQueue.empty() &&
                m_LowPriorityQueue.empty())
            {
                break;
            }

            // Service the queues from high to low priority, but only until each band's
            // expire rate is reached.
            if (!m_HighPriorityQueue.empty() && highPriorityCount < expireRate)
            {
                currentExecInProgress = m_HighPriorityQueue.front();
                m_HighPriorityQueue.pop();
                highPriorityCount += 1;
            }
            else if (!m_MediumPriorityQueue.empty() && mediumPriorityCount < expireRate)
            {
                currentExecInProgress = m_MediumPriorityQueue.front();
                m_MediumPriorityQueue.pop();
                mediumPriorityCount += 1;
                // Reset the high priority count.
                highPriorityCount = 0;
            }
            else if (!m_LowPriorityQueue.empty())
            {
                currentExecInProgress = m_LowPriorityQueue.front();
                m_LowPriorityQueue.pop();
                // Reset the high and medium priority counts.
                highPriorityCount = 0;
                mediumPriorityCount = 0;
            }
            else
            {
                // All queues are empty: reset the counts and wait again.
                highPriorityCount = 0;
                mediumPriorityCount = 0;
                continue;
            }
        }

        // Invoke the asynchronous execution method.
        auto inputTensors  = std::get<0>(*currentExecInProgress);
        auto outputTensors = std::get<1>(*currentExecInProgress);
        auto cb            = std::get<2>(*currentExecInProgress);

        // Get the time at the start of inference.
        HighResolutionClock startTime = armnn::GetTimeNow();

        try // Execute the inference.
        {
            // Execute, then report the status and timings through the callback.
            Execute(inputTensors, outputTensors, workingMemHandleRef) == Status::Success ?
                cb->Notify(Status::Success, std::make_pair(startTime, armnn::GetTimeNow())) :
                cb->Notify(Status::Failure, std::make_pair(startTime, armnn::GetTimeNow()));
        }
        catch (const RuntimeException&)
        {
            cb->Notify(Status::Failure, std::make_pair(startTime, armnn::GetTimeNow()));
        }
    }
}
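// Editor's note: the expire rate caps how many consecutive high (or medium) priority
// items a worker may take before it must service the next band down, preventing a steady
// stream of high priority work from starving the lower queues. For example, with an
// expire rate of 3 a worker takes at most three high priority jobs in a row, then one
// medium priority job (which resets the high counter); taking a low priority job resets
// both counters.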
void LoadedNetwork::EnqueueInput(const BindableLayer& layer,
                                 const ConstTensor& inputTensor,
                                 WorkingMemHandle& context)
{
    if (layer.GetType() != LayerType::Input)
    {
        throw InvalidArgumentException("EnqueueInput: given layer not an InputLayer");
    }
    LayerGuid id = layer.GetGuid();
    WorkingMemDescriptor descriptor = context.GetWorkingMemDescriptor(id);

    MemorySourceFlags importFlags = descriptor.m_Outputs[0]->GetImportFlags();
    if (m_NetworkProperties.m_ImportEnabled)  // Try to import the input tensor.
    {
        if (CheckFlag(importFlags, m_NetworkProperties.m_InputSource))
        {
            std::unique_ptr<ITensorHandle> tensorHandle =
                std::make_unique<ConstPassthroughTensorHandle>(inputTensor.GetInfo(),
                                                               inputTensor.GetMemoryArea());
            void* mem = tensorHandle->Map(false);
            if (descriptor.m_Outputs[0]->Import(mem, m_NetworkProperties.m_InputSource))
            {
                tensorHandle->Unmap();
                return;
            }
            tensorHandle->Unmap();
            throw MemoryImportException("EnqueueInput: Memory Import failed");
        }
        else
        {
            throw MemoryImportException("EnqueueInput: Memory Import failed, backend does not support Import");
        }
    }
    else
    {
        std::unique_ptr<ITensorHandle> tensorHandle =
            std::make_unique<ConstPassthroughTensorHandle>(inputTensor.GetInfo(), inputTensor.GetMemoryArea());

        auto copyFunc = [](void* dst, const void* src, size_t size)
        {
            memcpy(dst, src, size);
        };

        for (const auto& input : descriptor.m_Outputs)
        {
            CopyTensorContentsGeneric(tensorHandle.get(), input, copyFunc);
        }
    }
}
void LoadedNetwork::EnqueueOutput(const BindableLayer& layer, const Tensor& outputTensor, WorkingMemHandle& handle)
{
    if (layer.GetType() != LayerType::Output)
    {
        throw InvalidArgumentException("EnqueueOutput: given layer not an OutputLayer");
    }
    ARMNN_ASSERT_MSG(layer.GetNumInputSlots() == 1, "Output Layer should have exactly one input.");

    LayerGuid id = layer.GetGuid();
    WorkingMemDescriptor descriptor = handle.GetWorkingMemDescriptor(id);

    ITensorHandle* inputTensorHandle = descriptor.m_Inputs[0];
    ARMNN_ASSERT_MSG(inputTensorHandle != nullptr, "Data should have been allocated.");

    // Try to export the output tensor (see the preconditions listed in the synchronous
    // EnqueueOutput above).
    if (m_NetworkProperties.m_ExportEnabled &&
        (layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetNumConnections() == 1))
    {
        if (layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetOwningLayer().GetType() != LayerType::Input)
        {
            MemorySourceFlags importFlags = inputTensorHandle->GetImportFlags();
            if (CheckFlag(importFlags, m_NetworkProperties.m_OutputSource))
            {
                std::unique_ptr<ITensorHandle> tensorHandle =
                    std::make_unique<PassthroughTensorHandle>(outputTensor.GetInfo(),
                                                              outputTensor.GetMemoryArea());

                void* mem = tensorHandle->Map(false);
                bool importOk = inputTensorHandle->Import(mem, m_NetworkProperties.m_OutputSource);
                tensorHandle->Unmap();

                if (importOk)
                {
                    // Synchronise the exported memory instead of copying it.
                    ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "SyncMemGeneric");
                    inputTensorHandle->Map(true);
                    inputTensorHandle->Unmap();
                }
                else
                {
                    throw MemoryExportException("EnqueueOutput: Memory Export failed");
                }
            }
            else
            {
                throw MemoryExportException("EnqueueOutput: Memory Export failed, backend does not support Export");
            }
        }
        else
        {
            throw MemoryExportException("EnqueueOutput: Memory Export failed, attempting to export Input Layer");
        }
    }
    else
    {
        auto copyFunc = [](void* dst, const void* src, size_t size)
        {
            memcpy(dst, src, size);
        };

        std::unique_ptr<ITensorHandle> tensorHandle =
            std::make_unique<PassthroughTensorHandle>(outputTensor.GetInfo(),
                                                      outputTensor.GetMemoryArea());

        CopyTensorContentsGeneric(inputTensorHandle, tensorHandle.get(), copyFunc);
    }
}
const armnn::ConstTensor GetInputTensor(const LayerBindingId layerId, const InputTensors& inputTensors)
{
    for (auto inputTensorPair : inputTensors)
    {
        if (inputTensorPair.first == layerId)
        {
            return inputTensorPair.second;
        }
    }
    throw InvalidArgumentException("Input does not exist.");
}

const armnn::Tensor GetOutputTensor(const LayerBindingId layerId, const OutputTensors& outputTensors)
{
    for (auto outputTensorPair : outputTensors)
    {
        if (outputTensorPair.first == layerId)
        {
            return outputTensorPair.second;
        }
    }
    throw InvalidArgumentException("Output does not exist.");
}
Status LoadedNetwork::Execute(const InputTensors& inputTensors,
                              const OutputTensors& outputTensors,
                              IWorkingMemHandle& iWorkingMemHandle)
{
    const Graph& graph = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph();

    // Walk the graph to determine the order of execution.
    if (graph.GetNumLayers() < 2)
    {
        ARMNN_LOG(warning) << "IRuntime::EnqueueWorkload()::Less than two nodes in graph";
        return Status::Failure;
    }

    std::unique_ptr<profiling::TimelineUtilityMethods> timelineUtils =
        profiling::TimelineUtilityMethods::GetTimelineUtils(m_ProfilingService);
    profiling::ProfilingGuid inferenceGuid = m_ProfilingService.GetNextGuid();

    bool executionSucceeded = true;

    if (timelineUtils)
    {
        // Add an inference timeline trace if profiling is enabled.
        // ...
        timelineUtils->Commit();
    }

    WorkingMemHandle& workingMemHandle = dynamic_cast<WorkingMemHandle&>(iWorkingMemHandle);
    std::lock_guard<std::mutex> lockGuard(workingMemHandle.GetMutex());

    if (!workingMemHandle.IsAllocated())
    {
        workingMemHandle.Allocate();
    }

    {
        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "PrepareInputs");
        for (const BindableLayer* inputLayer : graph.GetInputLayers())
        {
            EnqueueInput(*inputLayer, GetInputTensor(inputLayer->GetBindingId(), inputTensors), workingMemHandle);
        }
    }

    auto Fail = [&](const std::exception& error)
    {
        ARMNN_LOG(error) << "An error occurred attempting to execute a workload: " << error.what();
        executionSucceeded = false;
    };
    ProfilingDynamicGuid workloadInferenceID(0);

    try
    {
        for (unsigned int i = 0; i < m_WorkloadQueue.size(); ++i)
        {
            auto& workload = m_WorkloadQueue[i];
            if (timelineUtils)
            {
                workloadInferenceID = timelineUtils->RecordWorkloadInferenceAndStartOfLifeEvent(workload->GetGuid(),
                                                                                                inferenceGuid);
            }
            workload->ExecuteAsync(workingMemHandle.GetWorkingMemDescriptorAt(i));

            if (timelineUtils)
            {
                timelineUtils->RecordEndOfLifeEvent(workloadInferenceID);
            }
        }
    }
    // ... (armnn exception types are handled the same way)
    catch (const std::runtime_error& error)
    {
        Fail(error);
    }

    // For each output to the network, call EnqueueOutput with the data passed by the user.
    {
        ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "PrepareOutputs");
        for (const BindableLayer* outputLayer : graph.GetOutputLayers())
        {
            EnqueueOutput(*outputLayer, GetOutputTensor(outputLayer->GetBindingId(), outputTensors), workingMemHandle);
        }
    }

    return executionSucceeded ? Status::Success : Status::Failure;
}
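// Editor's note: thread safety in this overload comes from the WorkingMemHandle rather
// than the network: the handle's own mutex serialises use of its intermediate tensors,
// so inferences can run concurrently provided each uses a distinct WorkingMemHandle
// (as the thread pool does, one handle per worker).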
std::unique_ptr<IWorkingMemHandle> LoadedNetwork::CreateWorkingMemHandle(NetworkId networkId)
{
    Graph& order = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph();
    std::unordered_map<LayerGuid, std::vector<std::unique_ptr<ITensorHandle>>> tensorHandleMap;
    std::vector<WorkingMemDescriptor> workingMemDescriptors;
    std::unordered_map<LayerGuid, WorkingMemDescriptor> workingMemDescriptorMap;
    TensorHandleFactoryRegistry tensorHandleFactoryRegistry;
    WorkloadFactoryMap workloadFactoryMap;

    std::vector<std::shared_ptr<IMemoryManager>> memoryManagers;

    for (auto const& backend : m_Backends)
    {
        if (backend.second->SupportsTensorAllocatorAPI())
        {
            backend.second->RegisterTensorHandleFactories(
                tensorHandleFactoryRegistry,
                static_cast<MemorySourceFlags>(m_NetworkProperties.m_InputSource),
                static_cast<MemorySourceFlags>(m_NetworkProperties.m_OutputSource));
            memoryManagers.emplace_back(tensorHandleFactoryRegistry.GetMemoryManagers().back());
        }
        else
        {
            std::shared_ptr<IMemoryManager> memoryManager = backend.second->CreateMemoryManager();
            auto workloadFactory = backend.second->CreateWorkloadFactory(
                memoryManager, m_OptimizedNetwork->pOptimizedNetworkImpl->GetModelOptions());

            workloadFactoryMap.emplace(
                std::make_pair(backend.first, std::make_pair(std::move(workloadFactory), memoryManager)));
            memoryManagers.emplace_back(memoryManager);
        }
    }

    auto GetTensorHandle = [&](Layer* layer, const OutputSlot& outputSlot, bool isMemoryManaged)
    {
        ITensorHandleFactory::FactoryId factoryId = outputSlot.GetTensorHandleFactoryId();
        const TensorInfo& tensorInfo = outputSlot.GetTensorInfo();

        if (factoryId == ITensorHandleFactory::LegacyFactoryId)
        {
            BackendId id = layer->GetBackendId();
            ARMNN_NO_DEPRECATE_WARN_BEGIN
            return workloadFactoryMap.at(id).first->CreateTensorHandle(tensorInfo, isMemoryManaged);
            ARMNN_NO_DEPRECATE_WARN_END
        }
        else
        {
            ITensorHandleFactory* handleFactory = tensorHandleFactoryRegistry.GetFactory(factoryId);
            ARMNN_ASSERT(handleFactory);
            return handleFactory->CreateTensorHandle(tensorInfo, isMemoryManaged);
        }
    };

    std::unordered_map<const ITensorHandle*, unsigned int> handleReferenceCounts;
    for (auto&& layer : order)
    {
        WorkingMemDescriptor workingMemDescriptor;

        // Constant layers execution and management is handled during loaded network construction.
        if (layer->GetType() == LayerType::Constant)
        {
            continue;
        }
        bool isMemoryManaged = true;
        bool isInputLayer = true;
        // Look for a layer with 1 OutputSlot which has 1 connection, where that connection
        // is to an Output layer. If export is enabled, disable memory management so we can export.
        if ((layer->GetNumOutputSlots() == 1) &&
            (layer->GetOutputSlots()[0].GetNumConnections() == 1) &&
            (layer->GetOutputSlots()[0].GetConnection(0)->GetOwningLayer().GetType() == LayerType::Output))
        {
            isMemoryManaged = !m_NetworkProperties.m_ExportEnabled;
        }
        else if (layer->GetType() == LayerType::Input || layer->GetType() == LayerType::MemImport)
        {
            // Input layers/workloads will not be executed, so the descriptor is not added to
            // workingMemDescriptors; however we still need to manage the tensor handles.
            isInputLayer = false;
            isMemoryManaged = !m_NetworkProperties.m_ExportEnabled;
        }

        // Create a tensor handle for each output slot of a layer.
        // Once created, its lifetime starts being managed.
        for (auto& slot : layer->GetOutputSlots())
        {
            tensorHandleMap[layer->GetGuid()].emplace_back(GetTensorHandle(layer, slot, isMemoryManaged));
            ITensorHandle* tensorHandle = tensorHandleMap[layer->GetGuid()].back().get();

            workingMemDescriptor.m_Outputs.push_back(tensorHandle);
            tensorHandle->Manage();
            unsigned int numConnections = slot.GetNumConnections();
            ARMNN_ASSERT(numConnections != 0);

            handleReferenceCounts[tensorHandle] = numConnections;
        }

        // Loop through the input slots of the same layer and decrement the reference counter
        // associated with each tensor handle we encounter. Once it reaches zero, the tensor
        // handle's lifetime has ended, and its memory is marked as available for reuse by the
        // next tensor handle with a non-overlapping lifetime.
        for (auto& slot : layer->GetInputSlots())
        {
            ARMNN_ASSERT(slot.GetConnection());
            auto outputSlot = slot.GetConnectedOutputSlot();
            auto key = outputSlot->GetOwningLayer().GetGuid();

            // Constant layers execution and management is handled during loaded network construction.
            auto found = m_ConstantTensorHandles.find(key);
            if (found != m_ConstantTensorHandles.end())
            {
                workingMemDescriptor.m_Inputs.push_back(found->second);
                continue;
            }

            auto search = tensorHandleMap.find(key);
            unsigned int index = outputSlot->CalculateIndexOnOwner();
            ITensorHandle* inputTensorHandle = search->second[index].get();
            workingMemDescriptor.m_Inputs.push_back(inputTensorHandle);
            --handleReferenceCounts.at(inputTensorHandle);
            if (handleReferenceCounts.at(inputTensorHandle) == 0u)
            {
                // Stop managing the lifetime of this tensor handle.
                inputTensorHandle->Allocate();
                handleReferenceCounts.erase(inputTensorHandle);
            }
        }
        workingMemDescriptorMap.insert({layer->GetGuid(), workingMemDescriptor});

        // Input layers/workloads are not executed, so only add the descriptor for other layers.
        if (isInputLayer)
        {
            workingMemDescriptors.push_back(workingMemDescriptor);
        }
    }

    return std::make_unique<WorkingMemHandle>(networkId,
                                              workingMemDescriptors,
                                              workingMemDescriptorMap,
                                              memoryManagers,
                                              std::move(tensorHandleMap));
}
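// Editor's note: the reference counting above implements lifetime-based memory sharing
// within one WorkingMemHandle. Manage() marks the start of a tensor handle's lifetime
// with its factory's memory manager; each consuming input slot decrements the count, and
// on the last use Allocate() ends the managed lifetime so a later handle with a
// non-overlapping lifetime can reuse the same backing storage.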
void LoadedNetwork::RegisterDebugCallback(const DebugCallbackFunction& func)
{
    for (auto&& workloadPtr : m_WorkloadQueue)
    {
        workloadPtr.get()->RegisterDebugCallback(func);
    }
}

} // namespace armnn