ArmNN
 24.02
GpuFsaPreCompiledWorkload Class Reference

#include <GpuFsaPreCompiledWorkload.hpp>

Inheritance diagram for GpuFsaPreCompiledWorkload:
[legend]
Collaboration diagram for GpuFsaPreCompiledWorkload:
[legend]

Public Member Functions

 GpuFsaPreCompiledWorkload (const PreCompiledQueueDescriptor &descriptor, const WorkloadInfo &info)
 
void Execute () const override
 
- Public Member Functions inherited from BaseWorkload< PreCompiledQueueDescriptor >
 BaseWorkload (const PreCompiledQueueDescriptor &descriptor, const WorkloadInfo &info)
 
virtual const std::string & GetName () const override
 
void ExecuteAsync (ExecutionData &executionData) override
 
void PostAllocationConfigure () override
 
const PreCompiledQueueDescriptor & GetData () const
 
arm::pipe::ProfilingGuid GetGuid () const final
 
virtual bool SupportsTensorHandleReplacement () const override
 
void ReplaceInputTensorHandle (ITensorHandle *tensorHandle, unsigned int slot) override
 
void ReplaceOutputTensorHandle (ITensorHandle *tensorHandle, unsigned int slot) override
 
- Public Member Functions inherited from IWorkload
virtual ~IWorkload ()
 
virtual arm::pipe::ProfilingGuid GetGuid () const =0
 
virtual const std::string & GetName () const =0
 
virtual void RegisterDebugCallback (const DebugCallbackFunction &)
 
virtual armnn::Optional< armnn::MemoryRequirements > GetMemoryRequirements ()
 

Additional Inherited Members

- Protected Attributes inherited from BaseWorkload< PreCompiledQueueDescriptor >
PreCompiledQueueDescriptor m_Data
 
const arm::pipe::ProfilingGuid m_Guid
 
const std::string m_Name
 

Detailed Description

Definition at line 30 of file GpuFsaPreCompiledWorkload.hpp.

Constructor & Destructor Documentation

◆ GpuFsaPreCompiledWorkload()

GpuFsaPreCompiledWorkload ( const PreCompiledQueueDescriptor & descriptor,
const WorkloadInfo & info 
)

Definition at line 31 of file GpuFsaPreCompiledWorkload.cpp.

33  : BaseWorkload<PreCompiledQueueDescriptor>(descriptor, info), m_workloadInfo(info)
34 {
35  // Check that the workload is holding a pointer to a valid pre-compiled object
36  if (m_Data.m_PreCompiledObject == nullptr)
37  {
38  throw InvalidArgumentException(
39  "GpuFsaPrecompiledWorkload requires a valid pre-compiled object (GpuWorkloadSketch).");
40  }
41 }

References armnn::info, BaseWorkload< PreCompiledQueueDescriptor >::m_Data, and PreCompiledQueueDescriptor::m_PreCompiledObject.

Member Function Documentation

◆ Execute()

void Execute ( ) const
override virtual

Implements IWorkload.

Definition at line 43 of file GpuFsaPreCompiledWorkload.cpp.

44 {
45 /*
46  * The Execute function of the GpuFsa Backends PreCompiled workload needs to jump through various hoops in order to
47  * create a valid sketch and runtime that can execute the kernel
48  * First we need all of the data stored within the PreCompiled blob which was used to setup the workload, namely:
49  * The GpuWorkloadContext, this is a context which contains the TensorInfos and is unique to the graph being run
50  * The Sketch, this can contain one or many ops and acts as a subgraph within the context
51  * The inputTensorInfos / outputTensorInfos, These are vectors containing the TensorInfos used when creating the sketch
52  *
53  * It is very important that the Tensors passed into the Runtime being used to execute this sketch are created with
54  * the same TensorInfos as used when creating the sketch. We do this by creating new tensors, getting the original
55  * TensorInfos from the vectors of tensorInfos stored in the blob, and then importing the buffers from our own
56  * TensorHandles directly into these newly created Tensors. This allows us to link the externally visible Tensors
57  * from ArmNN to the Tensors which are needed to execute with the Sketch.
58  *
59  */
60  using namespace arm_compute::experimental::dynamic_fusion;
61  // Get the runtime and configure it with the precompiled sketch
62  ClWorkloadRuntime runtime;
63  GpuFsaPreCompiledBlob *preCompiledBlob = static_cast<GpuFsaPreCompiledBlob*>(m_Data.m_PreCompiledObject);
64  auto sketch = preCompiledBlob->sketch.release();
65  auto status = runtime.configure(*sketch);
66 
67  // Get the TensorInfos stored within the PreCompiledBlob and check they're the right size
68  auto inputTensorInfos = preCompiledBlob->inputTensorInfos.get();
69  auto outputTensorInfos = preCompiledBlob->outputTensorInfos.get();
70  if (inputTensorInfos->size() != m_Data.m_Inputs.size())
71  {
72  throw InvalidArgumentException(fmt::format("GpuFsaPreCompiledWorkload::Execute: The number of inputTensorInfos"
73  " {} does not match the number of inputs {}.",
74  inputTensorInfos->size(), m_Data.m_Inputs.size()));
75  }
76  if (outputTensorInfos->size() != m_Data.m_Outputs.size())
77  {
78  throw InvalidArgumentException(fmt::format("GpuFsaPreCompiledWorkload::Execute: The number of outputTensorInfos"
79  " {} does not match the number of outputs {}.",
80  outputTensorInfos->size(), m_Data.m_Outputs.size()));
81  }
82 
83  // (Important) Allocate auxiliary tensor memory if there are any
84  for(auto &data : runtime.get_auxiliary_tensors())
85  {
86  arm_compute::CLTensor* tensor = std::get<0>(data);
87  arm_compute::TensorInfo info = std::get<1>(data);
88  arm_compute::experimental::dynamic_fusion::AuxMemoryInfo aux_mem_req = std::get<2>(data);
89  tensor->allocator()->init(info, aux_mem_req.alignment);
90  tensor->allocator()->allocate(); // Use ACL allocated memory
91  }
92 
93  // Create and initialize user tensors
94  std::vector<arm_compute::CLTensor*> inputsWeightsOutputs;
95  inputsWeightsOutputs.reserve(m_Data.m_Inputs.size() + m_Data.m_Outputs.size());
96 
97  for (uint32_t inputSlotIdx = 0; inputSlotIdx < m_Data.m_Inputs.size(); ++inputSlotIdx)
98  {
99  arm_compute::CLTensor* input = new arm_compute::CLTensor{};
100  // inputTensorInfos is a ptr to a vector of ptrs, so we need to do a double dereference
101  input->allocator()->init(*((*inputTensorInfos)[inputSlotIdx]));
102  auto* inputHandle = PolymorphicDowncast<GpuFsaTensorHandle*>(m_Data.m_Inputs[inputSlotIdx]);
103  input->allocator()->import_memory(inputHandle->GetTensor().cl_buffer());
104  inputsWeightsOutputs.emplace_back(std::move(input));
105  }
106  // Set the outputs
107  for (uint32_t outputSlotIdx = 0; outputSlotIdx < m_Data.m_Outputs.size(); ++outputSlotIdx)
108  {
109  arm_compute::CLTensor* output = new arm_compute::CLTensor{};
110  // outputTensorInfos is a ptr to a vector of ptrs, so we need to do a double dereference
111  output->allocator()->init(*((*outputTensorInfos)[outputSlotIdx]));
112  auto* outputHandle = PolymorphicDowncast<GpuFsaTensorHandle*>(m_Data.m_Outputs[outputSlotIdx]);
113  output->allocator()->import_memory(outputHandle->GetTensor().cl_buffer());
114  inputsWeightsOutputs.emplace_back(std::move(output));
115  }
116  runtime.run(inputsWeightsOutputs);
117 }

References armnn::info, GpuFsaPreCompiledBlob::inputTensorInfos, BaseWorkload< PreCompiledQueueDescriptor >::m_Data, QueueDescriptor::m_Inputs, QueueDescriptor::m_Outputs, PreCompiledQueueDescriptor::m_PreCompiledObject, GpuFsaPreCompiledBlob::outputTensorInfos, and GpuFsaPreCompiledBlob::sketch.


The documentation for this class was generated from the following files:
GpuFsaPreCompiledWorkload.hpp
GpuFsaPreCompiledWorkload.cpp
armnn::PreCompiledQueueDescriptor::m_PreCompiledObject
void * m_PreCompiledObject
Definition: WorkloadData.hpp:519
armnn::BoostLogSeverityMapping::info
@ info
armnn::QueueDescriptor::m_Outputs
std::vector< ITensorHandle * > m_Outputs
Definition: WorkloadData.hpp:27
armnn::BaseWorkload< PreCompiledQueueDescriptor >::m_Data
PreCompiledQueueDescriptor m_Data
Definition: Workload.hpp:89
armnn::QueueDescriptor::m_Inputs
std::vector< ITensorHandle * > m_Inputs
Definition: WorkloadData.hpp:26