13 #include <fmt/format.h>
16 #include <arm_compute/runtime/CL/CLTensor.h>
17 #include <arm_compute/core/ITensorInfo.h>
18 #include <arm_compute/core/TensorInfo.h>
19 #include <arm_compute/core/TensorShape.h>
20 #include <arm_compute/core/CL/CLKernelLibrary.h>
21 #include <arm_compute/core/CL/CLCompileContext.h>
23 #include <arm_compute/dynamic_fusion/runtime/gpu/cl/ClWorkloadRuntime.h>
24 #include <arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h>
25 #include <arm_compute/dynamic_fusion/sketch/gpu/operators/GpuConv2d.h>
26 #include <arm_compute/dynamic_fusion/sketch/gpu/operators/GpuOutput.h>
27 #include <arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h>
// NOTE(review): fragment of the GpuFsa pre-compiled workload execution path (Arm NN
// GpuFsa backend, Arm Compute Library dynamic fusion). This view is incomplete: many
// intermediate lines are missing, and each line carries a stray numeric prefix from the
// extraction. Code is left byte-identical; only comments are added.

// Tail of a validity check's message: the workload requires a non-null pre-compiled
// blob holding a GpuWorkloadSketch (the check expression itself is outside this view).
39 "GpuFsaPrecompiledWorkload requires a valid pre-compiled object (GpuWorkloadSketch).");
60 using namespace arm_compute::experimental::dynamic_fusion;
// Runtime object that will own/execute the fused GPU workload.
62 ClWorkloadRuntime runtime;
// Take ownership of the sketch out of the pre-compiled blob (release() — the blob no
// longer owns it afterwards), then configure the runtime from it.
64 auto sketch = preCompiledBlob->
sketch.release();
65 auto status = runtime.configure(*sketch);
// Tails of two diagnostic format strings: runtime tensor counts must match the
// workload's input/output counts (the surrounding checks are outside this view).
73 " {} does not match the number of inputs {}.",
79 " {} does not match the number of outputs {}.",
// Allocate every auxiliary (scratch) tensor the configured runtime requires. Each entry
// is a tuple of (tensor, tensor info, aux memory requirement).
84 for(
auto &data : runtime.get_auxiliary_tensors())
86 arm_compute::CLTensor* tensor = std::get<0>(data);
87 arm_compute::TensorInfo
info = std::get<1>(data);
88 arm_compute::experimental::dynamic_fusion::AuxMemoryInfo aux_mem_req = std::get<2>(data);
// Initialise the allocator with the required info and alignment, then allocate.
89 tensor->allocator()->init(
info, aux_mem_req.alignment);
90 tensor->allocator()->allocate();
// Flat list of CL tensors handed to runtime.run() — order is inputs then outputs
// (the name suggests weights may appear too, but none are added in this view).
94 std::vector<arm_compute::CLTensor*> inputsWeightsOutputs;
// Wrap each Arm NN input tensor handle in a CLTensor and import its existing CL buffer
// (zero-copy: import_memory reuses the backing cl_buffer rather than allocating).
97 for (uint32_t inputSlotIdx = 0; inputSlotIdx <
m_Data.
m_Inputs.size(); ++inputSlotIdx)
// NOTE(review): raw `new` with no visible matching delete — inputsWeightsOutputs holds
// owning raw pointers. Confirm cleanup happens after run() in the unseen lines;
// otherwise this leaks one CLTensor per input/output per Execute() call.
99 arm_compute::CLTensor* input =
new arm_compute::CLTensor{};
// inputTensorInfos is presumably a member populated at construction — not visible here.
101 input->allocator()->init(*((*inputTensorInfos)[inputSlotIdx]));
102 auto* inputHandle = PolymorphicDowncast<GpuFsaTensorHandle*>(
m_Data.
m_Inputs[inputSlotIdx]);
103 input->allocator()->import_memory(inputHandle->GetTensor().cl_buffer());
// std::move on a raw pointer is a no-op (pointers are trivially copyable).
104 inputsWeightsOutputs.emplace_back(std::move(input));
// Same wrapping/import pattern for the outputs (same leak concern as above).
107 for (uint32_t outputSlotIdx = 0; outputSlotIdx <
m_Data.
m_Outputs.size(); ++outputSlotIdx)
109 arm_compute::CLTensor* output =
new arm_compute::CLTensor{};
111 output->allocator()->init(*((*outputTensorInfos)[outputSlotIdx]));
112 auto* outputHandle = PolymorphicDowncast<GpuFsaTensorHandle*>(
m_Data.
m_Outputs[outputSlotIdx]);
113 output->allocator()->import_memory(outputHandle->GetTensor().cl_buffer());
114 inputsWeightsOutputs.emplace_back(std::move(output));
// Execute the fused workload over the assembled tensor list.
116 runtime.run(inputsWeightsOutputs);