// Extracted from: ComputeLibrary/latest/_n_e_o_n_2_g_e_m_m_lowp_8cpp_source.xhtml

/*

 * Copyright (c) 2017-2024 Arm Limited.

 *

 * SPDX-License-Identifier: MIT

 *

 * Permission is hereby granted, free of charge, to any person obtaining a copy

 * of this software and associated documentation files (the "Software"), to

 * deal in the Software without restriction, including without limitation the

 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or

 * sell copies of the Software, and to permit persons to whom the Software is

 * furnished to do so, subject to the following conditions:

 *

 * The above copyright notice and this permission notice shall be included in all

 * copies or substantial portions of the Software.

 *

 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR

 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,

 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE

 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER

 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,

 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE

 * SOFTWARE.

 */

#include "arm_compute/core/Types.h"

#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"

#include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"

#include "arm_compute/runtime/Tensor.h"

#include "arm_compute/runtime/TensorAllocator.h"

#include "src/core/helpers/MemoryHelpers.h"

#include "src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h"

#include "tests/NEON/Accessor.h"

#include "tests/NEON/Helper.h"

#include "tests/PaddingCalculator.h"

#include "tests/datasets/GEMMLowpFusedOffsetOutputDataset.h"

#include "tests/datasets/LargeGEMMLowpDataset.h"

#include "tests/datasets/ShapeDatasets.h"

#include "tests/datasets/SmallGEMMLowpDataset.h"

#include "tests/framework/Asserts.h"

#include "tests/framework/Macros.h"

#include "tests/framework/datasets/Datasets.h"

#include "tests/validation/Validation.h"

#include "tests/validation/fixtures/GEMMLowpFixture.h"


namespace arm_compute

{

namespace test

{

namespace validation

{

TEST_SUITE(NEON)

TEST_SUITE(GEMMLowp)

TEST_SUITE(MatrixMultiplyCore)


// Validation fixture instantiated for the Neon backend types (Tensor / Accessor) and the
// NEGEMMLowpMatrixMultiplyCore function, leaving the remaining template parameters at their defaults.

using NEGEMMLowpMatrixMultiplyCoreFixture = GEMMLowpMatrixMultiplyCoreValidationFixture<Tensor, Accessor, NEGEMMLowpMatrixMultiplyCore>;

// Same fixture template with three explicit boolean template arguments (false, false, true).
// NOTE(review): the meaning of the flags is defined in GEMMLowpFixture.h; the alias name suggests the
// last one selects the batched mat-mul path - confirm. The alias is not referenced in the visible part of this file.

using NEGEMMLowpBatchedMatMulFixture      = GEMMLowpMatrixMultiplyCoreValidationFixture<Tensor, Accessor, NEGEMMLowpMatrixMultiplyCore, false, false, true>;


// Lets the dataset definitions in this file write make(...) without the framework::dataset:: prefix.

using framework::dataset::make;


/** Configuration test for @ref NEGEMMLowpMatrixMultiplyCore.
 *
 * Runs over the concatenation of the small and large GEMMLowp datasets.
 *
 * Checks performed in order:
 * - All three tensors are still resizable before the function is configured
 * - Configuring the function leaves all three tensors without padding
 */
DATA_TEST_CASE(Configuration,
               framework::DatasetMode::ALL,
               framework::dataset::concat(datasets::SmallGEMMLowpDataset(), datasets::LargeGEMMLowpDataset()),
               shape_a, shape_b, shape_c, a_offset, b_offset)
{
    // 8-bit quantized operands; each one receives the offset supplied by the dataset
    auto a = create_tensor<Tensor>(shape_a, DataType::QASYMM8);
    a.info()->set_quantization_info(QuantizationInfo(1.0f / 255, a_offset));

    auto b = create_tensor<Tensor>(shape_b, DataType::QASYMM8);
    b.info()->set_quantization_info(QuantizationInfo(1.0f / 255, b_offset));

    // 32-bit integer result tensor
    auto c = create_tensor<Tensor>(shape_c, DataType::S32);

    // No allocation has happened at this point, so every tensor is expected to report itself as resizable
    ARM_COMPUTE_EXPECT(a.info()->is_resizable(), framework::LogLevel::ERRORS);
    ARM_COMPUTE_EXPECT(b.info()->is_resizable(), framework::LogLevel::ERRORS);
    ARM_COMPUTE_EXPECT(c.info()->is_resizable(), framework::LogLevel::ERRORS);

    // Configure without the optional third input (nullptr)
    NEGEMMLowpMatrixMultiplyCore gemmlowp_mm;
    gemmlowp_mm.configure(&a, &b, nullptr, &c);

    // Configuration must not have introduced padding on any tensor
    const PaddingSize no_padding = PaddingSize();
    validate(a.info()->padding(), no_padding);
    validate(b.info()->padding(), no_padding);
    validate(c.info()->padding(), no_padding);
}


// *INDENT-OFF*

// clang-format off

// Static validation test: each column of the zipped dataset is one scenario, made of the A, B and
// output descriptors plus the boolean outcome expected from NEGEMMLowpMatrixMultiplyCore::validate().
DATA_TEST_CASE(Validate, framework::DatasetMode::ALL,
    zip(
        // Matrix A descriptors
        make("InputAInfo", {
            TensorInfo(TensorShape(21U, 13U), 1, DataType::QASYMM8, QuantizationInfo(1.f/255, 10)), // Input not a multiple of 4
            TensorInfo(TensorShape(21U, 13U), 1, DataType::S32),                                    // Mismatching data type
            TensorInfo(TensorShape(20U, 13U), 1, DataType::QASYMM8, QuantizationInfo(1.f/255, 10)), // Invalid dimensions: A's first dimension (20) differs from B's second dimension (21)
            TensorInfo(TensorShape(21U, 13U), 1, DataType::QASYMM8, QuantizationInfo(1.f/255, 10)), // Invalid dimensions: paired with an (8, 11) output
            TensorInfo(TensorShape(16U, 32U), 1, DataType::QASYMM8, QuantizationInfo(1.f/255, 10)),
        }),
        // Matrix B descriptors
        make("InputBInfo", {
            TensorInfo(TensorShape(33U, 21U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 10)),
            TensorInfo(TensorShape(33U, 21U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 10)),
            TensorInfo(TensorShape(33U, 21U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 10)),
            TensorInfo(TensorShape(33U, 21U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 10)),
            TensorInfo(TensorShape(64U, 16U), 1, DataType::QASYMM8, QuantizationInfo(1.f/256, 10)),
        }),
        // Output descriptors
        make("OutputInfo", {
            TensorInfo(TensorShape(33U, 13U), 1, DataType::S32),
            TensorInfo(TensorShape(33U, 13U), 1, DataType::S32),
            TensorInfo(TensorShape(33U, 13U), 1, DataType::S32),
            TensorInfo(TensorShape(8U, 11U), 1, DataType::S32),
            TensorInfo(TensorShape(64U, 32U), 1, DataType::S32),
        }),
        // Outcome expected from validate() for each scenario
        make("Expected", { true, false, false, false, true })),
    a_info, b_info, output_info, expected)
{
    // validate() is handed non-resizable copies, so the dataset entries themselves stay untouched
    const auto locked_a = a_info.clone();
    locked_a->set_is_resizable(false);

    const auto locked_b = b_info.clone();
    locked_b->set_is_resizable(false);

    const auto locked_output = output_info.clone();
    locked_output->set_is_resizable(false);

    // The third input is optional and not provided here
    const Status status = NEGEMMLowpMatrixMultiplyCore::validate(locked_a.get(), locked_b.get(), nullptr, locked_output.get());
    ARM_COMPUTE_EXPECT(bool(status) == expected, framework::LogLevel::ERRORS);
}

// clang-format on

// *INDENT-ON*


/** Test case for memory injection in @ref cpu::CpuGemmLowpMatrixMultiplyCore.
 *
 * Configure the operator once and inject memory at run-time in multiple executions.
 *
 * Checks performed in order:
 * - Both runs compute the same output (compared element by element as 32-bit integers,
 *   because the destination tensor is S32)
 */
TEST_CASE(MemoryInjection, framework::DatasetMode::ALL)
{
    auto gemm     = std::make_unique<cpu::CpuGemmLowpMatrixMultiplyCore>();
    auto a_info   = TensorInfo(TensorShape(32U, 72U), 1, DataType::QASYMM8);
    auto b_info   = TensorInfo(TensorShape(17U, 32U), 1, DataType::QASYMM8);
    auto dst_info = TensorInfo(TensorShape(17U, 72U), 1, DataType::S32);
    a_info.set_quantization_info(QuantizationInfo(1.0f / 255, -9));
    b_info.set_quantization_info(QuantizationInfo(1.0f / 255, 1));
    const auto gemm_info = GEMMInfo{};

    // The operator is configured exactly once, from tensor infos only
    gemm->configure(&a_info, &b_info, nullptr, &dst_info, gemm_info);

    // The inputs are created and allocated once and shared by every run. The destination registered
    // here only seeds the run pack: each run replaces it with a freshly created tensor (see run_conv).
    auto a   = create_tensor<Tensor>(a_info);
    auto b   = create_tensor<Tensor>(b_info);
    auto dst = create_tensor<Tensor>(dst_info);
    a.allocator()->allocate();
    b.allocator()->allocate();
    dst.allocator()->allocate();

    ITensorPack run_pack =
    {
        { TensorType::ACL_SRC_0, &a },
        { TensorType::ACL_SRC_1, &b },
        { TensorType::ACL_DST, &dst }
    };
    ITensorPack prep_pack =
    {
        { TensorType::ACL_SRC_1, &b },
    };

    // Memory for the operator's workspace is provided from outside the operator
    auto mg = MemoryGroup{};
    auto ws = manage_workspace<Tensor>(gemm->workspace(), mg, run_pack, prep_pack);

    auto run_conv = [&]() -> Tensor
    {
        // A new destination tensor is injected on every call of this lambda
        auto dst = create_tensor<Tensor>(dst_info);
        dst.allocator()->allocate();
        run_pack.add_tensor(TensorType::ACL_DST, &dst);

        library->fill_tensor_value(Accessor(a), static_cast<uint8_t>(1));
        library->fill_tensor_value(Accessor(b), static_cast<uint8_t>(2));
        // This operator is configured once and captured by this lambda.
        gemm->prepare(prep_pack);
        gemm->run(run_pack);
        return dst;
    };
    auto result_0 = run_conv();
    auto result_1 = run_conv();

    // total_size() counts elements and the destination holds 32-bit values, so the buffers have to be
    // read as int32_t: indexing them as uint8_t would cover only the first quarter of the output bytes.
    const auto  *out_0        = reinterpret_cast<const int32_t *>(result_0.buffer());
    const auto  *out_1        = reinterpret_cast<const int32_t *>(result_1.buffer());
    const size_t num_elements = result_0.info()->tensor_shape().total_size();
    for(size_t i = 0; i < num_elements; ++i)
    {
        ARM_COMPUTE_EXPECT(out_0[i] == out_1[i], framework::LogLevel::ERRORS);
    }
}


/** Test case for memory injection in @ref NEGEMMLowpMatrixMultiplyCore.
 *
 * Make sure @ref NEGEMMLowpMatrixMultiplyCore still works through injecting the memory at configure time using the old API.
 *
 * Checks performed in order:
 * - Both runs compute the same output (compared element by element as 32-bit integers,
 *   because the destination tensor is S32)
 */
TEST_CASE(MultipleExecutionWithConfigure, framework::DatasetMode::ALL)
{
    auto gemm     = std::make_unique<NEGEMMLowpMatrixMultiplyCore>();
    auto a_info   = TensorInfo(TensorShape(32U, 72U), 1, DataType::QASYMM8);
    auto b_info   = TensorInfo(TensorShape(17U, 32U), 1, DataType::QASYMM8);
    auto dst_info = TensorInfo(TensorShape(17U, 72U), 1, DataType::S32);
    a_info.set_quantization_info(QuantizationInfo(1.0f / 255, -9));
    b_info.set_quantization_info(QuantizationInfo(1.0f / 255, 1));
    const auto gemm_info = GEMMInfo{};
    auto       run_conv  = [&]()
    {
        // Every call builds new tensors and re-configures the same function object with them
        auto a   = create_tensor<Tensor>(a_info);
        auto b   = create_tensor<Tensor>(b_info);
        auto dst = create_tensor<Tensor>(dst_info);
        gemm->configure(&a, &b, nullptr, &dst, gemm_info);
        a.allocator()->allocate();
        b.allocator()->allocate();
        dst.allocator()->allocate();
        library->fill_tensor_value(Accessor(a), static_cast<uint8_t>(1));
        library->fill_tensor_value(Accessor(b), static_cast<uint8_t>(2));
        gemm->run();
        return dst;
    };
    auto result_0 = run_conv();
    auto result_1 = run_conv();

    // total_size() counts elements and the destination holds 32-bit values, so the buffers have to be
    // read as int32_t: indexing them as uint8_t would cover only the first quarter of the output bytes.
    const auto  *out_0        = reinterpret_cast<const int32_t *>(result_0.buffer());
    const auto  *out_1        = reinterpret_cast<const int32_t *>(result_1.buffer());
    const size_t num_elements = result_0.info()->tensor_shape().total_size();
    for(size_t i = 0; i < num_elements; ++i)
    {
        ARM_COMPUTE_EXPECT(out_0[i] == out_1[i], framework::LogLevel::ERRORS);
    }
}


// Runs NEGEMMLowpMatrixMultiplyCoreFixture over datasets::SmallGEMMLowpDataset() with DatasetMode::ALL.

FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpMatrixMultiplyCoreFixture, framework::DatasetMode::ALL, datasets::SmallGEMMLowpDataset())

{

    // Compare the target against the reference; no tolerance argument is passed to validate().
    // NOTE(review): _target and _reference are not declared in this file - presumably fixture members, confirm in GEMMLowpFixture.h.

    validate(Accessor(_target), _reference);

}


// Runs NEGEMMLowpMatrixMultiplyCoreFixture over datasets::LargeGEMMLowpDataset(); registered with
// DatasetMode::NIGHTLY rather than DatasetMode::ALL.

FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpMatrixMultiplyCoreFixture, framework::DatasetMode::NIGHTLY, datasets::LargeGEMMLowpDataset())

{

    // Compare the target against the reference; no tolerance argument is passed to validate().
    // NOTE(review): _target and _reference are not declared in this file - presumably fixture members, confirm in GEMMLowpFixture.h.

    validate(Accessor(_target), _reference);

}


// Absolute tolerance of 1 passed to validate() by both BatchedMatMul test cases.

constexpr AbsoluteTolerance<float> tolerance_batched(1);


// Fused-offset-output fixture instantiated with uint8_t for both type parameters.
// NOTE(review): the meaning of the boolean template arguments (false, false, ..., true) is defined by the
// fixture template, not in this file; the alias name suggests the trailing true selects batched mat-mul - confirm.

using NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixtureBatchedUnsigned =

    GEMMLowpMatrixMultiplyCoreFusedOffsetOutputGenericValidationFixture<Tensor, Accessor, NEGEMMLowpMatrixMultiplyCore, false, false, uint8_t, uint8_t, true>;


TEST_SUITE(BatchedMatMul)

TEST_SUITE(QASYMM8)

// Combines datasets::SmallGEMMLowpFusedBatchedMatMulDataset() with DataType::QASYMM8 and
// reshape_b_only_on_first_run = false; runs with DatasetMode::ALL.

FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixtureBatchedUnsigned, framework::DatasetMode::ALL,

    combine(datasets::SmallGEMMLowpFusedBatchedMatMulDataset(),

        make("DataType", { DataType::QASYMM8 }),

        make("reshape_b_only_on_first_run", { false })))

{

    // Compare the target against the reference, allowing an absolute difference of tolerance_batched

    validate(Accessor(_target), _reference, tolerance_batched);

}

TEST_SUITE_END() // QASYMM8


// Same fixture template as the unsigned alias, instantiated with int8_t for both type parameters.

using NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixtureBatchedSigned =

    GEMMLowpMatrixMultiplyCoreFusedOffsetOutputGenericValidationFixture<Tensor, Accessor, NEGEMMLowpMatrixMultiplyCore, false, false, int8_t, int8_t, true>;

TEST_SUITE(QASYMM8_SIGNED)

// Combines datasets::SmallGEMMLowpFusedBatchedMatMulDataset() with DataType::QASYMM8_SIGNED and
// reshape_b_only_on_first_run = false; runs with DatasetMode::ALL.

FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixtureBatchedSigned, framework::DatasetMode::ALL,

    combine(datasets::SmallGEMMLowpFusedBatchedMatMulDataset(),

        make("DataType", { DataType::QASYMM8_SIGNED }),

        make("reshape_b_only_on_first_run", { false })))

{

    // Compare the target against the reference, allowing an absolute difference of tolerance_batched

    validate(Accessor(_target), _reference, tolerance_batched);

}

TEST_SUITE_END() // QASYMM8_SIGNED

TEST_SUITE_END() // BatchedMatMul


// Fused-offset-output validation fixture instantiated for the Neon backend types (Tensor / Accessor)
// and NEGEMMLowpMatrixMultiplyCore, with the remaining template parameters left at their defaults.

using NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixture = GEMMLowpMatrixMultiplyCoreFusedOffsetOutputValidationFixture<Tensor, Accessor, NEGEMMLowpMatrixMultiplyCore>;

// Absolute tolerance of 1 passed to validate() by both FusedOffsetOutput test cases.

constexpr AbsoluteTolerance<float> tolerance_quant(1);


TEST_SUITE(FusedOffsetOutput)

// Combines datasets::SmallGEMMLowpFusedOffsetOutputUint8Dataset() with DataType::QASYMM8; runs with DatasetMode::ALL.

FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixture, framework::DatasetMode::ALL,

    combine(datasets::SmallGEMMLowpFusedOffsetOutputUint8Dataset(),

        make("DataType", { DataType::QASYMM8 })))

{

    // Compare the target against the reference, allowing an absolute difference of tolerance_quant

    validate(Accessor(_target), _reference, tolerance_quant);

}


// Combines datasets::LargeGEMMLowpFusedOffsetOutputUint8Dataset() with DataType::QASYMM8; registered with
// DatasetMode::NIGHTLY rather than DatasetMode::ALL.

FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMLowpMatrixMultiplyCoreFusedOffsetOutputFixture, framework::DatasetMode::NIGHTLY,

    combine(datasets::LargeGEMMLowpFusedOffsetOutputUint8Dataset(),

        make("DataType", { DataType::QASYMM8 })))

{

    // Compare the target against the reference, allowing an absolute difference of tolerance_quant

    validate(Accessor(_target), _reference, tolerance_quant);

}

TEST_SUITE_END() // FusedOffsetOutput

TEST_SUITE_END() // MatrixMultiplyCore

TEST_SUITE_END() // GEMMLowp

TEST_SUITE_END() // NEON

} // namespace validation

} // namespace test

} // namespace arm_compute