This example is essentially a copy of the SimpleSample example, but it uses a CustomAllocator to allocate the memory for the input, output, and inter-layer tensors.
#include <armnn/ArmNN.hpp>
#include <armnn/backends/ICustomAllocator.hpp>

#include <arm_compute/core/CL/CLKernelLibrary.h>
#include <arm_compute/runtime/CL/CLScheduler.h>

#include <algorithm>
#include <cstdlib>
#include <iostream>
#include <memory>
#include <string>
{
public:
SampleClBackendCustomAllocator() = default;
void*
allocate(
size_t size,
size_t alignment)
override
{
if (alignment == 0)
{
alignment = arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
}
size_t space = size + alignment + alignment;
auto allocatedMemPtr = std::malloc(space * sizeof(size_t));
if (std::align(alignment, size, allocatedMemPtr, space) == nullptr)
{
}
return allocatedMemPtr;
}
void free(
void* ptr)
override
{
std::free(ptr);
}
{
}
};
int main()
{
float number;
std::cout << "Please enter a number: " << std::endl;
std::cin >> number;
FullyConnectedDescriptor fullyConnectedDesc;
float weightsData[] = {1.0f};
weightsInfo.SetConstant(true);
ConstTensor weights(weightsInfo, weightsData);
IConnectableLayer* inputLayer = network->AddInputLayer(0);
IConnectableLayer* weightsLayer = network->AddConstantLayer(weights, "Weights");
IConnectableLayer* fullyConnectedLayer =
network->AddFullyConnectedLayer(fullyConnectedDesc, "fully connected");
IConnectableLayer* outputLayer = network->AddOutputLayer(0);
inputLayer->GetOutputSlot(0).Connect(fullyConnectedLayer->GetInputSlot(0));
weightsLayer->GetOutputSlot(0).Connect(fullyConnectedLayer->GetInputSlot(1));
fullyConnectedLayer->GetOutputSlot(0).Connect(outputLayer->GetInputSlot(0));
weightsLayer->GetOutputSlot(0).SetTensorInfo(weightsInfo);
IRuntime::CreationOptions options;
auto customAllocator = std::make_shared<SampleClBackendCustomAllocator>();
options.m_CustomAllocatorMap = {{"GpuAcc", std::move(customAllocator)}};
inputLayer->GetOutputSlot(0).SetTensorInfo(inputTensorInfo);
unsigned int numElements = inputTensorInfo.GetNumElements();
size_t totalBytes = numElements * sizeof(float);
fullyConnectedLayer->GetOutputSlot(0).SetTensorInfo(outputTensorInfo);
OptimizerOptionsOpaque optOptions;
optOptions.SetImportEnabled(true);
Optimize(*network, {
"GpuAcc"}, runtime->GetDeviceSpec(), optOptions);
if (!optNet)
{
std::cerr << "Error: Failed to optimise the input network." << std::endl;
return 1;
}
std::string ignoredErrorMessage;
runtime->LoadNetwork(networkIdentifier, std::move(optNet), ignoredErrorMessage, networkProperties);
const size_t alignment =
arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
void* alignedInputPtr = options.m_CustomAllocatorMap["GpuAcc"]->allocate(totalBytes, alignment);
auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr);
std::fill_n(inputPtr, numElements, number);
void* alignedOutputPtr = options.m_CustomAllocatorMap["GpuAcc"]->allocate(totalBytes, alignment);
auto* outputPtr = reinterpret_cast<float*>(alignedOutputPtr);
std::fill_n(outputPtr, numElements, -10.0f);
inputTensorInfo = runtime->GetInputTensorInfo(networkIdentifier, 0);
inputTensorInfo.SetConstant(true);
{
{0, ConstTensor(inputTensorInfo, alignedInputPtr)},
};
{
{0, Tensor(runtime->GetOutputTensorInfo(networkIdentifier, 0), alignedOutputPtr)}
};
runtime->EnqueueWorkload(networkIdentifier, inputTensors, outputTensors);
arm_compute::CLScheduler::get().sync();
auto* outputResult = reinterpret_cast<float*>(alignedOutputPtr);
std::cout << "Your number was " << outputResult[0] << std::endl;
runtime->UnloadNetwork(networkIdentifier);
return 0;
}
Base class for all ArmNN exceptions so that users can filter to just those.
Custom Allocator interface.
virtual void* allocate(size_t size, size_t alignment) = 0
Interface to be implemented by the child class to allocate bytes.
virtual void free(void* ptr) = 0
Interface to be implemented by the child class to free the allocated bytes.
virtual armnn::MemorySource GetMemorySourceType()=0
Used to specify what type of memory is being allocated by this allocator.
static INetworkPtr Create(const NetworkOptions &networkOptions={})
static IRuntimePtr Create(const CreationOptions &options)
Copyright (c) 2021 ARM Limited and Contributors.
MemorySource
Define the Memory Source to reduce copies.
std::unique_ptr< IRuntime, void(*)(IRuntime *runtime)> IRuntimePtr
std::unique_ptr< IOptimizedNetwork, void(*)(IOptimizedNetwork *network)> IOptimizedNetworkPtr
std::vector< std::pair< LayerBindingId, class Tensor > > OutputTensors
std::vector< std::pair< LayerBindingId, class ConstTensor > > InputTensors
IOptimizedNetworkPtr Optimize(const INetwork &network, const std::vector< BackendId > &backendPreferences, const IDeviceSpec &deviceSpec, const OptimizerOptionsOpaque &options=OptimizerOptionsOpaque(), Optional< std::vector< std::string > & > messages=EmptyOptional())
Create an optimized version of the network.
void ConfigureLogging(bool printToStandardOutput, bool printToDebugOutput, LogSeverity severity)
Configures the logging behaviour of the ARMNN library.
std::unique_ptr< INetwork, void(*)(INetwork *network)> INetworkPtr