ArmNN
 24.08
GpuFsaPreCompiledWorkload.cpp
Go to the documentation of this file.
1 //
2 // Copyright © 2024 Arm Ltd and Contributors. All rights reserved.
3 // SPDX-License-Identifier: MIT
4 //
5 
9 
11 #include <gpuFsa/GpuFsaBackend.hpp>
13 #include <fmt/format.h>
14 
16 #include <arm_compute/runtime/CL/CLTensor.h>
17 #include <arm_compute/core/ITensorInfo.h>
18 #include <arm_compute/core/TensorInfo.h>
19 #include <arm_compute/core/TensorShape.h>
20 #include <arm_compute/core/CL/CLKernelLibrary.h>
21 #include <arm_compute/core/CL/CLCompileContext.h>
22 
23 #include <arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h>
24 #include <arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h>
25 #include <arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h>
26 #include <arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h>
27 #include <arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h>
28 
29 namespace armnn {
30 
32  const WorkloadInfo &info)
33  : BaseWorkload<PreCompiledQueueDescriptor>(descriptor, info), m_workloadInfo(info)
34 {
    // Constructor: forwards descriptor/info to BaseWorkload and initialises m_workloadInfo from info.
    // NOTE(review): the first line of the signature (listing line 31) is not shown in this listing.
35  // Check that the workload is holding a pointer to a valid pre-compiled object
    // m_PreCompiledObject is declared as `void *`, so only null-ness can be checked here, not the
    // pointee's type.
36  if (m_Data.m_PreCompiledObject == nullptr)
37  {
    // NOTE(review): listing line 38 (the statement that consumes the message below) is not shown;
    // presumably it throws an exception carrying this text - confirm against the real source.
39  "GpuFsaPrecompiledWorkload requires a valid pre-compiled object (GpuWorkloadSketch).");
40  }
41 }
42 
44 {
45 /*
46  * The Execute function of the GpuFsa backend's PreCompiled workload needs to jump through various hoops in order to
47  * create a valid sketch and runtime that can execute the kernel
48  * First we need all of the data stored within the PreCompiled blob which was used to setup the workload, namely:
49  * The GpuWorkloadContext, this is a context which contains the TensorInfos and is unique to the graph being run
50  * The Sketch, this can contain one or many ops and acts as a subgraph within the context
51  * The inputTensorInfos / outputTensorInfos, these are vectors containing the TensorInfos used when creating the sketch
52  *
53  * It is very important that the Tensors passed into the Runtime being used to execute this sketch are created with
54  * the same TensorInfos as used when creating the sketch. We do this by creating new tensors, getting the original
55  * TensorInfos from the vectors of tensorInfos stored in the blob, and then importing the buffers from our own
56  * TensorHandles directly into these newly created Tensors. This allows us to link the externally visible Tensors
57  * from ArmNN to the Tensors which are needed to execute with the Sketch.
58  *
    * Overall flow of the visible code: configure a local ClWorkloadRuntime with the sketch, validate the
    * TensorInfo counts against the workload's inputs/outputs, allocate auxiliary tensors, wrap each
    * input/output handle's CL buffer in a new CLTensor, then run the runtime on those tensors.
    *
    * Throws InvalidArgumentException when the number of stored input (or output) TensorInfos differs
    * from the number of input (or output) handles in m_Data.
    *
    * NOTE(review): the function signature (listing line 43) and the declaration of preCompiledBlob
    * (listing line 63) are not shown in this listing; preCompiledBlob is presumably obtained from
    * m_Data.m_PreCompiledObject - confirm against the real source.
59  */
60  using namespace arm_compute::experimental::dynamic_fusion;
61  // Get the runtime and configure it with the precompiled sketch
    // A fresh runtime is constructed and configured on every call of this function.
62  ClWorkloadRuntime runtime;
    // NOTE(review): `sketch` is a std::unique_ptr in the blob; release() gives up ownership and leaves
    // that unique_ptr null. Nothing visible here deletes the returned raw pointer, and a second call
    // on the same blob would dereference a null pointer at `*sketch` unless the blob is repopulated
    // elsewhere. Using get() instead looks like the intent - confirm.
64  auto sketch = preCompiledBlob->sketch.release();
    // NOTE(review): `status` is never inspected in the visible code, so a failed configure() is not
    // reported before the runtime is used.
65  auto status = runtime.configure(*sketch);
66 
67  // Get the TensorInfos stored within the PreCompiledBlob and check they're the right size
    // get() only borrows the vectors; the blob's unique_ptrs keep ownership of them.
68  auto inputTensorInfos = preCompiledBlob->inputTensorInfos.get();
69  auto outputTensorInfos = preCompiledBlob->outputTensorInfos.get();
70  if (inputTensorInfos->size() != m_Data.m_Inputs.size())
71  {
72  throw InvalidArgumentException(fmt::format("GpuFsaPreCompiledWorkload::Execute: The number of inputTensorInfos"
73  " {} does not match the number of inputs {}.",
74  inputTensorInfos->size(), m_Data.m_Inputs.size()));
75  }
76  if (outputTensorInfos->size() != m_Data.m_Outputs.size())
77  {
78  throw InvalidArgumentException(fmt::format("GpuFsaPreCompiledWorkload::Execute: The number of outputTensorInfos"
79  " {} does not match the number of outputs {}.",
80  outputTensorInfos->size(), m_Data.m_Outputs.size()));
81  }
82 
83  // (Important) Allocate auxiliary tensor memory if there are any
    // Each entry is unpacked with std::get as (tensor pointer, tensor info, aux memory requirements).
84  for(auto &data : runtime.get_auxiliary_tensors())
85  {
86  arm_compute::CLTensor* tensor = std::get<0>(data);
87  arm_compute::TensorInfo info = std::get<1>(data);
88  arm_compute::experimental::dynamic_fusion::AuxMemoryInfo aux_mem_req = std::get<2>(data);
89  tensor->allocator()->init(info, aux_mem_req.alignment);
90  tensor->allocator()->allocate(); // Use ACL allocated memory
91  }
92 
93  // Create and initialize user tensors
    // Despite the name, only inputs and then outputs are appended to this vector in the visible code.
94  std::vector<arm_compute::CLTensor*> inputsWeightsOutputs;
95  inputsWeightsOutputs.reserve(m_Data.m_Inputs.size() + m_Data.m_Outputs.size());
96 
    // Set the inputs: one new CLTensor per input handle, initialised from the stored TensorInfo.
97  for (uint32_t inputSlotIdx = 0; inputSlotIdx < m_Data.m_Inputs.size(); ++inputSlotIdx)
98  {
    // NOTE(review): allocated with raw `new` and never deleted in the visible code, so each call
    // appears to leak one CLTensor per input - consider std::unique_ptr or stack storage.
99  arm_compute::CLTensor* input = new arm_compute::CLTensor{};
100  // inputTensorInfos is a ptr to a vector of ptrs, so we need to do a double dereference
101  input->allocator()->init(*((*inputTensorInfos)[inputSlotIdx]));
102  auto* inputHandle = PolymorphicDowncast<GpuFsaTensorHandle*>(m_Data.m_Inputs[inputSlotIdx]);
    // Import the handle's existing CL buffer rather than allocating new memory for this tensor.
103  input->allocator()->import_memory(inputHandle->GetTensor().cl_buffer());
    // std::move on a raw pointer is just a copy; the pointer value is stored in the vector.
104  inputsWeightsOutputs.emplace_back(std::move(input));
105  }
106  // Set the outputs
107  for (uint32_t outputSlotIdx = 0; outputSlotIdx < m_Data.m_Outputs.size(); ++outputSlotIdx)
108  {
    // NOTE(review): same raw `new` without a visible delete as for the inputs above.
109  arm_compute::CLTensor* output = new arm_compute::CLTensor{};
110  // outputTensorInfos is a ptr to a vector of ptrs, so we need to do a double dereference
111  output->allocator()->init(*((*outputTensorInfos)[outputSlotIdx]));
112  auto* outputHandle = PolymorphicDowncast<GpuFsaTensorHandle*>(m_Data.m_Outputs[outputSlotIdx]);
113  output->allocator()->import_memory(outputHandle->GetTensor().cl_buffer());
114  inputsWeightsOutputs.emplace_back(std::move(output));
115  }
    // Tensors are passed in the order they were appended: all inputs first, then all outputs.
116  runtime.run(inputsWeightsOutputs);
117 }
118 } // namespace armnn
armnn::PreCompiledQueueDescriptor::m_PreCompiledObject
void * m_PreCompiledObject
Definition: WorkloadData.hpp:519
armnn::PreCompiledQueueDescriptor
Definition: WorkloadData.hpp:512
GpuFsaTensorHandle.hpp
armnn::GpuFsaPreCompiledBlob::inputTensorInfos
std::unique_ptr< std::vector< arm_compute::ITensorInfo * > > inputTensorInfos
Definition: GpuFsaBackend.hpp:37
armnn::WorkloadInfo
Contains information about TensorInfos of a layer.
Definition: WorkloadInfo.hpp:16
PolymorphicDowncast.hpp
armnn::GpuFsaPreCompiledBlob::sketch
std::unique_ptr< arm_compute::experimental::dynamic_fusion::GpuWorkloadSketch > sketch
Definition: GpuFsaBackend.hpp:34
armnn::InvalidArgumentException
Definition: Exceptions.hpp:80
armnn::BoostLogSeverityMapping::info
@ info
armnn::QueueDescriptor::m_Outputs
std::vector< ITensorHandle * > m_Outputs
Definition: WorkloadData.hpp:27
GpuFsaWorkloadUtils.hpp
armnn::BaseWorkload
Definition: Workload.hpp:33
armnn::GpuFsaPreCompiledWorkload::GpuFsaPreCompiledWorkload
GpuFsaPreCompiledWorkload(const PreCompiledQueueDescriptor &descriptor, const WorkloadInfo &info)
Definition: GpuFsaPreCompiledWorkload.cpp:31
GpuFsaBackend.hpp
armnn::BaseWorkload< PreCompiledQueueDescriptor >::m_Data
PreCompiledQueueDescriptor m_Data
Definition: Workload.hpp:89
armnn
Copyright (c) 2021 ARM Limited and Contributors.
Definition: 01_00_quick_start.dox:6
ArmComputeTensorUtils.hpp
armnn::GpuFsaPreCompiledBlob
A structure which contains all the elements needed to execute a fused workload in the GpuFsa Backend.
Definition: GpuFsaBackend.hpp:32
GpuFsaPreCompiledWorkload.hpp
armnn::GpuFsaPreCompiledWorkload::Execute
void Execute() const override
Definition: GpuFsaPreCompiledWorkload.cpp:43
armnn::GpuFsaPreCompiledBlob::outputTensorInfos
std::unique_ptr< std::vector< arm_compute::ITensorInfo * > > outputTensorInfos
Definition: GpuFsaBackend.hpp:38
armnn::QueueDescriptor::m_Inputs
std::vector< ITensorHandle * > m_Inputs
Definition: WorkloadData.hpp:26