38 namespace experimental
40 namespace dynamic_fusion
// Rule-of-five special members of DataView, all defaulted: the view only
// aggregates a non-owning tensor pointer plus by-value descriptors (see the
// (tensor, tensor_info, memory_info) fields used later in this file), so the
// compiler-generated copy/move/destroy semantics are correct.
// Defaulted destructor (non-virtual: DataView is a plain value type).
62 ~DataView() =
default;
// Defaulted copy constructor.
63 DataView(
const DataView &other) =
default;
// Defaulted copy assignment.
64 DataView &operator=(
const DataView &other) =
default;
// Defaulted move constructor.
65 DataView(DataView &&other) =
default;
// Defaulted move assignment.
66 DataView &operator=(DataView &&other) =
default;
// Accessors for the collected auxiliary-tensor views. The bodies are elided
// from this extract -- presumably both return _tensors; confirm against the
// full file.
// Non-const overload.
73 std::vector<DataView> get_tensors()
// Const overload.
77 std::vector<DataView> get_tensors()
const
// create_aux_tensors is granted friendship so it can reach the non-public
// add_aux_tensor() below (presumably private -- the access specifier is on
// an elided line) while populating a ClAuxTensors from the workload source.
82 friend Status create_aux_tensors(ClAuxTensors *aux_tensors,
const GpuWorkloadSourceCode &code);
// Returns the CLTensor associated with the given tensor info, creating and
// taking ownership of a new one on first request. De-duplication is keyed by
// a tensor-info id: t_id is used below but its initialisation (presumably
// tensor_info.id()) is on a line elided from this extract -- confirm.
92 CLTensor *add_aux_tensor(
const ITensorInfo &
tensor_info,
const AuxMemoryInfo &aux_memory_info)
// Fast path: an aux tensor with this id has already been registered; hand
// back the existing owned object.
95 auto find_tensor_pair = _owned_tensors.find(t_id);
96 if (find_tensor_pair != _owned_tensors.end())
98 return find_tensor_pair->second.get();
// Slow path: allocate a new owned tensor, index it by id, and record a
// non-owning DataView alongside the caller-supplied descriptors.
102 auto tensor = std::make_unique<CLTensor>();
103 auto inserted_pair = _owned_tensors.emplace(t_id, std::move(
tensor)).first;
104 auto new_tensor = inserted_pair->second.get();
105 _tensors.emplace_back(new_tensor,
tensor_info, aux_memory_info);
// NOTE(review): the return of new_tensor is on a line elided from this view.
// Owning storage: each auxiliary CLTensor is kept alive by unique_ptr for
// the lifetime of this object; keyed by tensor-info id for de-duplication
// in add_aux_tensor().
110 std::map<ITensorInfo::Id, std::unique_ptr<CLTensor>> _owned_tensors{};
// Non-owning views (tensor pointer + by-value descriptors) in insertion
// order; exposed via get_tensors().
111 std::vector<DataView> _tensors{};
// Walks every tensor id in the workload source code and registers those that
// carry auxiliary-memory descriptors with *aux_tensors. Braces, the branch
// selecting auxiliary tensors, the error path, and the final Status return
// are elided from this extract.
122 Status create_aux_tensors(ClAuxTensors *aux_tensors,
const GpuWorkloadSourceCode &code)
124 for (
auto t_id : code.tensors())
// Look up the workload argument describing this tensor id.
127 const auto workload_arg = code.query_tensor(t_id);
128 ICLTensor *tensor_object =
nullptr;
// Copy the tensor info by value: the DataView later stored by
// add_aux_tensor() keeps its own TensorInfo, independent of the source-code
// structures.
132 const TensorInfo
tensor_info = *workload_arg->tensor_info();
134 const auto aux_memory_info = workload_arg->memory_descriptor()->aux_memory_info;
135 tensor_object = aux_tensors->add_aux_tensor(
tensor_info, aux_memory_info);
// A null result means registration failed; the error branch is elided here.
137 if (tensor_object ==
nullptr)
// Interior of find_tensor_pack(): returns a pointer into the map when the
// unit-workload id is known; the not-found return (presumably nullptr, as
// the callers in create_tensor_lut() treat nullptr as "no pack yet") is on
// an elided line.
159 auto tensor_pack = _tensor_packs.find(uwk_id);
160 if (tensor_pack != _tensor_packs.end())
162 return &(tensor_pack->second);
// Interior of the checked accessor: std::map::at throws std::out_of_range
// for an unknown unit-workload id.
174 return _tensor_packs.at(uwk_id);
// create_tensor_lut populates this LUT from the workload source code plus
// the user-provided and auxiliary tensors; friendship lets it reach the
// non-public helpers below.
177 friend Status create_tensor_lut(ClTensorLUT *tensor_lut,
178 const GpuWorkloadSourceCode &code,
179 const std::vector<CLTensor *> &user_tensors,
180 const ClAuxTensors &aux_tensors);
// Inserts or overwrites the tensor pack bound to a unit workload.
// operator[] default-constructs the pack if absent, then copy-assigns.
188 void add_tensor_pack(
UnitWorkloadId uwk_id,
const ITensorPack &tensor_pack)
190 _tensor_packs[uwk_id] = tensor_pack;
// One ITensorPack per unit workload, keyed by its id.
192 std::map<UnitWorkloadId, ITensorPack> _tensor_packs{};
// Builds the unit-workload-id -> ITensorPack lookup table:
//   1) index the user tensors by tensor-info id,
//   2) merge in the auxiliary tensors,
//   3) validate each mapped tensor, then add it to the pack of every unit
//      workload that references it.
// Braces, error branches and the final Status return are elided from this
// extract.
204 Status create_tensor_lut(ClTensorLUT *tensor_lut,
205 const GpuWorkloadSourceCode &code,
206 const std::vector<CLTensor *> &user_tensors,
207 const ClAuxTensors &aux_tensors)
// Phase 1: user tensors, keyed by the id carried in each tensor's info.
210 std::map<ITensorInfo::Id, CLTensor *> tensor_map;
211 for (
auto tensor : user_tensors)
213 const auto t_id =
tensor->info()->id();
// Id collision between user tensors: keep BOTH tensors by minting a fresh
// id (current max + 1) for the newcomer instead of overwriting.
// NOTE(review): workloads can then never address the re-keyed tensor by its
// original id -- confirm this collision policy is intended.
215 if (tensor_map.find(t_id) != tensor_map.end())
218 std::vector<ITensorInfo::Id> ids;
219 for (
auto &
t : tensor_map)
221 ids.push_back(
t.first);
223 ITensorInfo::Id new_id = *std::max_element(ids.begin(), ids.end()) + 1;
224 tensor_map[new_id] =
tensor;
// No collision: index the tensor under its own id.
228 tensor_map[t_id] =
tensor;
// Phase 2: auxiliary tensors. The duplicate-id branch body (line 235's
// consequence) is elided from this extract.
231 for (
const auto &data : aux_tensors.get_tensors())
233 const auto t_id = data.tensor_info.id();
234 const auto tensor = data.tensor;
235 if (tensor_map.find(t_id) != tensor_map.end())
239 tensor_map[t_id] =
tensor;
// Phase 3: validate every mapped tensor and distribute it to the packs of
// all unit workloads that consume it.
243 for (
auto id_tensor : tensor_map)
245 const auto t_id = id_tensor.first;
246 const auto tensor_object = id_tensor.second;
// Null tensor object: error branch elided.
247 if (tensor_object ==
nullptr)
// Unallocated tensor (zero backing size): error branch elided.
251 if (tensor_object->allocator()->info().total_size() == 0U)
256 for (
auto uwk_id : code.get_unit_workloads_from_tensor(t_id))
258 ITensorPack *tensor_pack = tensor_lut->find_tensor_pack(uwk_id);
// First tensor for this unit workload creates its pack; later tensors are
// appended to the existing pack.
259 if (tensor_pack ==
nullptr)
261 tensor_lut->add_tensor_pack(uwk_id, ITensorPack{{t_id, tensor_object}});
265 tensor_pack->add_tensor(t_id, tensor_object);
// Pimpl state for ClWorkloadRuntime (the struct's opening/closing braces
// are on lines elided from this extract).
275 struct ClWorkloadRuntime::Implementation
// Kernels executed on every run, keyed by unit-workload id.
277 std::map<UnitWorkloadId, std::unique_ptr<ClKernelRuntime>> _kernels{};
// Kernels executed once, in prepare().
278 std::map<UnitWorkloadId, std::unique_ptr<ClKernelRuntime>> _kernels_prep{};
// Set at the end of configure().
279 bool _is_configured{
false};
// Latch that makes prepare() idempotent.
280 bool _is_prepared{
false};
// Per-unit-workload tensor packs, populated by create_tensor_lut().
281 ClTensorLUT _tensor_lut{};
// Auxiliary (intermediate/scratch) tensors owned by the runtime.
282 ClAuxTensors _aux_tensors{};
// The compiled workload description this runtime executes.
283 GpuWorkloadSourceCode _source_code{};
// configure() fragment: validates the sketch targets OpenCL (the macro
// wrapping this message is on an elided line), then builds one
// ClKernelRuntime per unit workload, partitioning them by stage into the
// run-every-time and prepare-once maps. The stage dispatch and the kernel
// configuration calls are elided from this extract.
300 "ClWorkloadRuntime cannot be configured with non-OpenCL workload sketch");
304 for (
auto uwk_id : _impl->_source_code.unit_workloads())
306 const auto work = _impl->_source_code.query_unit_workload(uwk_id);
307 const auto stage = work.stage().stage;
308 auto k = std::make_unique<ClKernelRuntime>();
// Run-stage kernel: executed on every run. (Only one of the two emplace
// lines executes per iteration; the branch on `stage` is elided.)
315 _impl->_kernels.emplace(work.id(), std::move(k));
// Prepare-stage kernel: executed once in prepare().
320 _impl->_kernels_prep.emplace(work.id(), std::move(k));
// Register the auxiliary tensors required by the workload.
// NOTE(review): the Status returned by create_aux_tensors appears unchecked
// on the visible lines -- confirm against the full file.
330 create_aux_tensors(&_impl->_aux_tensors, _impl->_source_code);
331 _impl->_is_configured =
true;
// Runs every preparation-stage kernel exactly once; subsequent calls are
// no-ops thanks to the _is_prepared latch. The actual kernel invocation
// (consuming uwk_id / kernel / flush_queue) is on lines elided from this
// extract.
335 void ClWorkloadRuntime::prepare()
337 if (!_impl->_is_prepared)
339 for (
auto &id_kernel_pair : _impl->_kernels_prep)
// The CL queue is not flushed per kernel (presumably to batch submissions
// -- confirm).
341 const bool flush_queue =
false;
342 const auto uwk_id = id_kernel_pair.first;
343 auto kernel = id_kernel_pair.second.get();
347 _impl->_is_prepared =
true;
// run() fragment: (re)builds the tensor LUT from the user-supplied tensors
// plus the owned auxiliary tensors, then iterates the run-stage kernels.
// The Status check on `st`, the tensor-pack lookup and the kernel run call
// are on lines elided from this extract.
355 const auto st = create_tensor_lut(&_impl->_tensor_lut, _impl->_source_code, tensors, _impl->_aux_tensors);
358 for (
auto &id_kernel_pair : _impl->_kernels)
// As in prepare(): no per-kernel CL queue flush.
361 const bool flush_queue =
false;
362 const auto uwk_id = id_kernel_pair.first;
363 auto kernel = id_kernel_pair.second.get();
// Fragment exposing the auxiliary tensors as (tensor, info, aux-memory)
// tuples -- presumably so callers can allocate/import backing memory before
// running the workload; the enclosing function's signature and return are
// elided from this extract.
371 std::vector<std::tuple<CLTensor *, TensorInfo, AuxMemoryInfo>> aux_tensors;
372 for (
const auto &data : _impl->_aux_tensors.get_tensors())
374 aux_tensors.emplace_back(data.tensor, data.tensor_info, data.memory_info);