37 #include "tests/datasets/LargeGEMMDataset.h"
38 #include "tests/datasets/SmallGEMMDataset.h"
39 #include "tests/datasets/TinyGEMMDataset.h"
44 #include "tests/validation/fixtures/GEMMFixture.h"
45 #include "tests/validation/fixtures/GEMMInterleave4x4Fixture.h"
46 #include "tests/validation/fixtures/GEMMTranspose1xWFixture.h"
// Tolerances used when comparing NEON GEMM output against the C++ reference.
56 constexpr AbsoluteTolerance<float>
tolerance_f(0.001f);
// FP16 tolerances are only defined when the target supports half-precision
// vector arithmetic.
57 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// Relative tolerance for F16: half precision accumulates larger errors,
// so a much looser bound (20%) is used than for F32.
58 RelativeTolerance<half_float::half> rel_tolerance_f16(
half(0.2));
59 const AbsoluteTolerance<float> abs_tolerance_f16(0.2f);
// NOTE(review): original lines 60-64 are elided from this excerpt; this
// second guard presumably opens an FP16-only section — confirm in full file.
65 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// Configures a FunctionType kernel/function on a (dim0_value x dim1_value)
// input and reports whether configuration left the input tensor unpadded.
75 template <
typename FunctionType>
76 bool validate_zero_padding(
unsigned int dim0_value,
unsigned int dim1_value)
// Input shape built from the two test dimensions.
78 const TensorShape in_shape(dim0_value, dim1_value);
// NOTE(review): original lines 79-86 are elided here; `in`, `dst` and
// `func` are declared on those missing lines.
87 func.configure(&in, &
dst);
// Zero padding is verified by inspecting the configured input's padding.
89 return in.padding().empty();
// Configures CpuGemmMatrixMultiplyKernel on the given LHS/RHS shapes and
// reports whether both inputs and the output were left unpadded.
93 bool validate_gemm_zero_padding(
const TensorShape shape0,
const TensorShape shape1)
// NOTE(review): original lines 94-100 are elided here; the TensorInfo
// objects `in0`, `in1` and `dst` are declared on those missing lines.
101 cpu::kernels::CpuGemmMatrixMultiplyKernel
gemm;
// alpha = 1.0, is_interleaved = false (non-reshaped path).
102 gemm.configure(&in0, &in1, &
dst, 1.0,
false);
// All three tensors must be padding-free for the test to pass.
104 return in0.padding().empty() && in1.padding().empty() &&
dst.padding().empty();
// Test fragment: runs the stateless cpu::CpuGemm operator twice with
// externally managed tensors/workspace and (on elided lines) compares the
// two results element-wise — presumably a memory-injection test; confirm
// against the full file.
120 auto gemm = std::make_unique<cpu::CpuGemm>();
// 3x3 LHS, 4x3 RHS/C/DST, all F32 (note: TensorShape is WxH, i.e. cols x rows).
121 const auto lhs_info = TensorInfo(TensorShape(3U, 3U), 1,
DataType::F32);
122 const auto rhs_info = TensorInfo(TensorShape(4U, 3U), 1,
DataType::F32);
123 const auto c_info = TensorInfo(TensorShape(4U, 3U), 1,
DataType::F32);
124 auto dst_info = TensorInfo(TensorShape(4U, 3U), 1,
DataType::F32);
125 const auto gemm_info = GEMMInfo{};
// Configure once with alpha = beta = 1.
126 gemm->configure(&lhs_info, &rhs_info, &c_info, &dst_info, 1.f, 1.f, gemm_info);
// Backing tensors for the operator's inputs; allocated up front and shared
// across both runs.
129 auto lhs = create_tensor<Tensor>(lhs_info);
130 auto rhs = create_tensor<Tensor>(rhs_info);
131 auto c = create_tensor<Tensor>(c_info);
132 lhs.allocator()->allocate();
133 rhs.allocator()->allocate();
134 c.allocator()->allocate();
// NOTE(review): `run_pack` and `prep_pack` are built on elided lines
// (135-138); the workspace below is allocated against them.
139 auto mg = MemoryGroup{};
140 auto ws = manage_workspace<Tensor>(
gemm->workspace(), mg, run_pack, prep_pack);
// Helper: allocate a fresh destination, fill the inputs with known
// constants, run the operator (run() call on elided lines) and return dst.
142 auto run_conv = [&]() -> Tensor
144 auto dst = create_tensor<Tensor>(dst_info);
145 dst.allocator()->allocate();
148 library->fill_tensor_value(Accessor(lhs), 1.f);
149 library->fill_tensor_value(Accessor(rhs), 2.f);
150 library->fill_tensor_value(Accessor(c), 3.f);
152 gemm->prepare(prep_pack);
// Two independent executions must produce identical results.
156 auto result_0 = run_conv();
157 auto result_1 = run_conv();
158 for(
size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i)
// Test fragment: the same scenario as the CpuGemm test above but through the
// public NEGEMM function — tensors are created and configured inside the
// lambda, so each invocation is fully self-contained.
173 auto gemm = std::make_unique<NEGEMM>();
// Identical geometry to the operator-level test: 3x3 LHS, 4x3 RHS/C/DST, F32.
174 const auto lhs_info = TensorInfo(TensorShape(3U, 3U), 1,
DataType::F32);
175 const auto rhs_info = TensorInfo(TensorShape(4U, 3U), 1,
DataType::F32);
176 const auto c_info = TensorInfo(TensorShape(4U, 3U), 1,
DataType::F32);
177 auto dst_info = TensorInfo(TensorShape(4U, 3U), 1,
DataType::F32);
178 const auto gemm_info = GEMMInfo{};
// Helper: build tensors, (re)configure NEGEMM on them, fill with known
// constants, run (on elided lines) and return the result.
179 auto run_conv = [&]()
181 auto lhs = create_tensor<Tensor>(lhs_info);
182 auto rhs = create_tensor<Tensor>(rhs_info);
183 auto c = create_tensor<Tensor>(c_info);
184 auto dst = create_tensor<Tensor>(dst_info);
// alpha = beta = 1, default GEMMInfo.
185 gemm->configure(&lhs, &rhs, &c, &
dst, 1.f, 1.f, gemm_info);
186 lhs.allocator()->allocate();
187 rhs.allocator()->allocate();
188 c.allocator()->allocate();
189 dst.allocator()->allocate();
190 library->fill_tensor_value(Accessor(lhs), 1.f);
191 library->fill_tensor_value(Accessor(rhs), 2.f);
192 library->fill_tensor_value(Accessor(c), 3.f);
// Two runs must agree element-wise (comparison on elided lines).
196 auto result_0 = run_conv();
197 auto result_1 = run_conv();
198 for(
size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i)
// Static-validation fragment: checks NEGEMM::validate() for the
// lhs/rhs/output TensorInfo combinations declared on elided lines.
// alpha/beta fixed: plain matrix product, no accumulation.
219 constexpr
float alpha = 1.0;
220 constexpr
float beta = 0.0;
221 const auto gemm_info = GEMMInfo();
// Clones are made resizable because validate() may adjust the shapes;
// bias is nullptr (no C operand in this validation case).
222 bool is_valid = bool(
NEGEMM::validate(&lhs_info.clone()->set_is_resizable(
true), &rhs_info.clone()->set_is_resizable(
true),
nullptr, &
output_info.clone()->set_is_resizable(
true), alpha, beta, gemm_info));
// Kernel-selection fragment: for the "NEON" CPU extension, checks that the
// expected micro-kernel implementations are picked (selection calls are on
// elided lines).
229 combine(framework::dataset::
make("CpuExt", std::
string("NEON")),
235 using namespace cpu::kernels;
// Compare the selected implementation names against the expected ones
// (expected strings are built on elided lines).
246 std::string
actual = selected_impl_mul->name;
255 actual = selected_impl_add->name;
// Transpose1xW sub-suite: wraps the kernel so it can run with a zero
// constant border, then checks zero padding over several N values and
// validates the fixture output against the reference.
262 using CpuGemmTranspose1xW = NESynthetizeFunctionWithZeroConstantKernelBorder<cpu::kernels::CpuGemmTranspose1xWKernel>;
// N values chosen to cover 1, non-multiples and primes (the K dataset is on
// elided lines).
264 framework::dataset::
make("
N", { 1, 23, 63, 101 }),
268 bool status = validate_zero_padding<CpuGemmTranspose1xW>(n_value, k_value);
295 validate(Accessor(_target), _reference);
// Interleave4x4 sub-suite: mirrors the Transpose1xW suite above but over M.
302 using CpuGemmInterleave4x4 = NESynthetizeFunctionWithZeroConstantKernelBorder<cpu::kernels::CpuGemmInterleave4x4Kernel>;
305 framework::dataset::
make("
M", { 1, 23, 63, 101 }),
// NOTE(review): this instantiates validate_zero_padding on the raw kernel
// rather than the CpuGemmInterleave4x4 wrapper declared above — presumably
// intentional (the kernel needs no border here), but worth confirming.
309 bool status = validate_zero_padding<cpu::kernels::CpuGemmInterleave4x4Kernel>(m_value, k_value);
336 validate(Accessor(_target), _reference);
// Fixture alias binding the generic GEMM validation fixture to the NEON
// backend (Tensor/Accessor/NEGEMM), parameterised on the data type.
342 template <typename T>
343 using NEGEMMFixture = GEMMValidationFixture<Tensor, Accessor, NEGEMM, T>;
// NOTE(review): the alias following this template header (original line
// 346) is elided from this excerpt.
345 template <typename T>
// Zero-padding check for the matrix-multiply kernel over the shape dataset
// (dataset declaration is on elided lines).
365 bool status = validate_gemm_zero_padding(shape0, shape1);
// Data-type sub-suites (bodies largely elided). The FP16 suite is compiled
// only when the target supports half-precision vector arithmetic.
369 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// Small/large fixture runs exercise both the reshaped and non-reshaped
// weight paths.
391 framework::dataset::
make("ReshapeWeights", {
true,
false })),
403 framework::dataset::
make("ReshapeWeights", {
true,
false })),
// This case pins ReshapeWeights to false only — presumably a
// non-reshaped-specific fixture; confirm against the full file.
423 framework::dataset::
make("ReshapeWeights", {
false })),