// armnn/24.02/_gpu_fsa_backend_8hpp_source.html

//

// Copyright © 2022-2024 Arm Ltd and Contributors. All rights reserved.

// SPDX-License-Identifier: MIT

//

#pragma once


#include <armnn/backends/IBackendInternal.hpp>

#include <aclCommon/BaseMemoryManager.hpp>


#include <arm_compute/runtime/CL/CLBufferAllocator.h>

#include <arm_compute/runtime/CL/CLMemoryRegion.h>

#include <arm_compute/core/CL/CLKernelLibrary.h>

#include <CL/cl_ext.h>

#include <arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadContext.h>

#include <arm_compute/dynamic_fusion/sketch/gpu/GpuWorkloadSketch.h>


// System includes for mapping and unmapping memory

#include <sys/mman.h>


namespace armnn

{


/**

 * A structure which contains all the elements needed to execute a fused workload in the GpuFsa Backend

 *

 * @param[in, out]  sketch              A unique pointer to the sketch containing the operators which have been fused.

 * @param[in, out]  TensorInfos         A shared pointer to a GpuWorkloadContext which creates + stores TensorInfos

 * @param[in, out]  inputTensorInfos    A unique pointer to a vector of inputTensorInfos used by the sketch

 * @param[in, out]  outputTensorInfos   A unique pointer to a vector of outputTensorInfos used by the sketch

 *

 */

struct GpuFsaPreCompiledBlob

{

    std::unique_ptr<arm_compute::experimental::dynamic_fusion::GpuWorkloadSketch> sketch = nullptr;

    std::shared_ptr<arm_compute::experimental::dynamic_fusion::GpuWorkloadContext> workloadContext = nullptr;


    std::unique_ptr<std::vector<arm_compute::ITensorInfo*>> inputTensorInfos = nullptr;

    std::unique_ptr<std::vector<arm_compute::ITensorInfo*>> outputTensorInfos = nullptr;

};


// Capability flags for the "GpuFsa" backend; GpuFsaBackend::GetCapabilities() returns this object.
// Append any new capability to the end of the list below.
const BackendCapabilities gpuFsaCapabilities(
    "GpuFsa",
    {
        {"NonConstWeights",            false},
        {"AsyncExecution",             false},
        {"ProtectedContentAllocation", false},
        {"ConstantTensorsAsInputs",    true},
        {"PreImportIOTensors",         false},
        {"ExternallyManagedMemory",    false},
        {"MultiAxisPacking",           false},
        {"SingleAxisPacking",          false}
    });


class GpuFsaBackend : public IBackendInternal

{

public:

    GpuFsaBackend() : m_CustomAllocator(nullptr) {};

    GpuFsaBackend(std::shared_ptr<ICustomAllocator> allocator)

    {

        UseCustomMemoryAllocator(allocator, armnn::EmptyOptional());

    }

    ~GpuFsaBackend() = default;


    static const BackendId& GetIdStatic();

    const BackendId& GetId() const override { return GetIdStatic(); }


    IBackendInternal::IMemoryManagerUniquePtr CreateMemoryManager() const override;


    IBackendInternal::IWorkloadFactoryPtr CreateWorkloadFactory(

        const IBackendInternal::IMemoryManagerSharedPtr& memoryManager = nullptr) const override;


    IBackendInternal::IWorkloadFactoryPtr CreateWorkloadFactory(TensorHandleFactoryRegistry& registry) const override;


    IWorkloadFactoryPtr CreateWorkloadFactory(class TensorHandleFactoryRegistry& tensorHandleFactoryRegistry,

                                              const ModelOptions& modelOptions,

                                              MemorySourceFlags inputFlags,

                                              MemorySourceFlags outputFlags) const override;


    std::vector<ITensorHandleFactory::FactoryId> GetHandleFactoryPreferences() const override;


    void RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry) override;


    void RegisterTensorHandleFactories(TensorHandleFactoryRegistry& registry,

                                       MemorySourceFlags inputFlags,

                                       MemorySourceFlags outputFlags) override;


    IBackendInternal::IBackendContextPtr CreateBackendContext(const IRuntime::CreationOptions&) const override;

    IBackendInternal::IBackendProfilingContextPtr CreateBackendProfilingContext(

        const IRuntime::CreationOptions&, IBackendProfilingPtr& backendProfiling) override;


    IBackendInternal::ILayerSupportSharedPtr GetLayerSupport() const override;


    OptimizationViews OptimizeSubgraphView(const SubgraphView& subgraph,

                                           const ModelOptions& modelOptions) const override;


    std::unique_ptr<ICustomAllocator> GetDefaultAllocator() const override;


    BackendCapabilities GetCapabilities() const override

    {

        return gpuFsaCapabilities;

    };


    virtual bool UseCustomMemoryAllocator(std::shared_ptr<ICustomAllocator> allocator,

                                          armnn::Optional<std::string&>) override

    {

        ARMNN_LOG(info) << "Using Custom Allocator for GpuFsaBackend";


        // Set flag to signal the backend to use a custom memory allocator

        m_CustomAllocator = std::make_shared<GpuFsaBackendCustomAllocatorWrapper>(std::move(allocator));

        m_UsingCustomAllocator = true;

        return m_UsingCustomAllocator;

    }


    // Cl requires a arm_compute::IAllocator we wrap the Arm NN ICustomAllocator to achieve this

    class GpuFsaBackendCustomAllocatorWrapper : public arm_compute::IAllocator

    {

    public:

        GpuFsaBackendCustomAllocatorWrapper(std::shared_ptr<ICustomAllocator> alloc) : m_CustomAllocator(alloc)

        {}

        // Inherited methods overridden:

        void* allocate(size_t size, size_t alignment) override

        {

            auto alloc = m_CustomAllocator->allocate(size, alignment);

            return MapAllocatedMemory(alloc, size, m_CustomAllocator->GetMemorySourceType());

        }

        void free(void* ptr) override

        {

            auto hostMemPtr = m_AllocatedBufferMappings[ptr];

            clReleaseMemObject(static_cast<cl_mem>(ptr));

            m_CustomAllocator->free(hostMemPtr);

        }

        std::unique_ptr<arm_compute::IMemoryRegion> make_region(size_t size, size_t alignment) override

        {

            auto hostMemPtr = m_CustomAllocator->allocate(size, alignment);

            cl_mem buffer = MapAllocatedMemory(hostMemPtr, size, m_CustomAllocator->GetMemorySourceType());


            return std::make_unique<ClBackendCustomAllocatorMemoryRegion>(cl::Buffer(buffer),

                                                                          hostMemPtr,

                                                                          m_CustomAllocator->GetMemorySourceType());

        }

    private:

        cl_mem MapAllocatedMemory(void* memory, size_t size, MemorySource source)

        {

            // Round the size of the buffer to a multiple of the CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE

            auto cachelineAlignment =

                    arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();

            auto roundedSize = cachelineAlignment + size - (size % cachelineAlignment);


            if (source == MemorySource::Malloc)

            {

                const cl_import_properties_arm importProperties[] =

                        {

                            CL_IMPORT_TYPE_ARM,

                            CL_IMPORT_TYPE_HOST_ARM,

                            0

                        };

                cl_int error = CL_SUCCESS;

                cl_mem buffer = clImportMemoryARM(arm_compute::CLKernelLibrary::get().context().get(),

                                                  CL_MEM_READ_WRITE,

                                                  importProperties,

                                                  memory,

                                                  roundedSize,

                                                  &error);

                if (error == CL_SUCCESS)

                {

                    m_AllocatedBufferMappings.insert(std::make_pair(static_cast<void *>(buffer), memory));

                    return buffer;

                }

                throw armnn::Exception(

                    "Mapping allocated memory from CustomMemoryAllocator failed, errcode: " + std::to_string(error));

            }

            else if (source == MemorySource::DmaBuf)

            {

                const cl_import_properties_arm importProperties[] =

                        {

                            CL_IMPORT_TYPE_ARM,

                            CL_IMPORT_TYPE_DMA_BUF_ARM,

                            CL_IMPORT_DMA_BUF_DATA_CONSISTENCY_WITH_HOST_ARM,

                            CL_TRUE,

                            0

                        };

                cl_int error = CL_SUCCESS;

                cl_mem buffer = clImportMemoryARM(arm_compute::CLKernelLibrary::get().context().get(),

                                                  CL_MEM_READ_WRITE,

                                                  importProperties,

                                                  memory,

                                                  roundedSize,

                                                  &error);

                if (error == CL_SUCCESS)

                {

                    m_AllocatedBufferMappings.insert(std::make_pair(static_cast<void *>(buffer), memory));

                    return buffer;

                }

                throw armnn::Exception(

                        "Mapping allocated memory from CustomMemoryAllocator failed, errcode: "

                         + std::to_string(error));

            }

            else if (source == MemorySource::DmaBufProtected)

            {

                const cl_import_properties_arm importProperties[] =

                        {

                                CL_IMPORT_TYPE_ARM,

                                CL_IMPORT_TYPE_DMA_BUF_ARM,

                                CL_IMPORT_TYPE_PROTECTED_ARM,

                                CL_TRUE,

                                0

                        };

                cl_int error = CL_SUCCESS;

                cl_mem buffer = clImportMemoryARM(arm_compute::CLKernelLibrary::get().context().get(),

                                                  CL_MEM_READ_WRITE,

                                                  importProperties,

                                                  memory,

                                                  roundedSize,

                                                  &error);

                if (error == CL_SUCCESS)

                {

                    m_AllocatedBufferMappings.insert(std::make_pair(static_cast<void *>(buffer), memory));

                    return buffer;

                }

                throw armnn::Exception(

                        "Mapping allocated memory from CustomMemoryAllocator failed, errcode: "

                         + std::to_string(error));

            }

            throw armnn::Exception(

                    "Attempting to allocate memory with unsupported MemorySource type in CustomAllocator");

        }

        std::shared_ptr<ICustomAllocator> m_CustomAllocator;

        std::map<void*, void*> m_AllocatedBufferMappings;

    };


    class ClBackendCustomAllocatorMemoryRegion : public arm_compute::ICLMemoryRegion

    {

    public:

        // We need to have a new version of ICLMemoryRegion which holds a hostMemPtr to allow for cpu copy access

        ClBackendCustomAllocatorMemoryRegion(const cl::Buffer &buffer, void* hostMemPtr, armnn::MemorySource source)

            : ICLMemoryRegion(buffer.getInfo<CL_MEM_SIZE>())

        {

            _mem = buffer;

            m_HostMemPtr = hostMemPtr;

            m_MemorySource = source;

        }


        // Inherited methods overridden :

        void* ptr() override

        {

            return nullptr;

        }


        void* map(cl::CommandQueue &q, bool blocking) override

        {

            armnn::IgnoreUnused(q, blocking);

            if (m_HostMemPtr == nullptr)

            {

                throw armnn::Exception("ClBackend: Attempting to map memory with an invalid host ptr");

            }

            if (_mapping != nullptr)

            {

                throw armnn::Exception("ClBackend: Attempting to map memory which has not yet been unmapped");

            }

            switch (m_MemorySource)

            {

                case armnn::MemorySource::Malloc:

                    _mapping = m_HostMemPtr;

                    return _mapping;

                    break;

                case armnn::MemorySource::DmaBuf:

                case armnn::MemorySource::DmaBufProtected:

                    // If the source is a Dmabuf then the memory ptr should be pointing to an integer value for the fd

                    _mapping = mmap(NULL, _size, PROT_WRITE, MAP_SHARED, *(reinterpret_cast<int*>(m_HostMemPtr)), 0);

                    return _mapping;

                    break;

                default:

                    throw armnn::Exception("ClBackend: Attempting to map imported memory without a valid source");

                    break;

            }

        }


        void unmap(cl::CommandQueue &q) override

        {

            armnn::IgnoreUnused(q);

            switch (m_MemorySource)

            {

                case armnn::MemorySource::Malloc:

                    _mapping = nullptr;

                    break;

                case armnn::MemorySource::DmaBuf:

                case armnn::MemorySource::DmaBufProtected:

                    munmap(_mapping, _size);

                    _mapping = nullptr;

                    break;

                default:

                    throw armnn::Exception("ClBackend: Attempting to unmap imported memory without a valid source");

                    break;

            }

        }

    private:

        void* m_HostMemPtr = nullptr;

        armnn::MemorySource m_MemorySource;

    };


    std::shared_ptr<GpuFsaBackendCustomAllocatorWrapper> m_CustomAllocator;

    bool m_UsingCustomAllocator = false;

};


} // namespace armnn