armnn/latest/_batch_mat_mul_impl_8cpp_source.html

 //

 // Copyright © 2022, 2024 Arm Ltd and Contributors. All rights reserved.

 // SPDX-License-Identifier: MIT

 //


 #include "BatchMatMulImpl.hpp"


 #include <armnn/backends/WorkloadData.hpp>

 #include <armnn/Logging.hpp>

 #include <armnnUtils/Permute.hpp>


 namespace armnn

 {


 BatchMatMul::BatchMatMul(const BatchMatMulDescriptor& params,

                          const TensorInfo& inputXInfo,

                          const TensorInfo& inputYInfo,

                          const TensorInfo& outputInfo,

                          Decoder<float>& inputXDecoder,

                          Decoder<float>& inputYDecoder,

                          Encoder<float>& outputEncoder)

     : params(params),

       inputXInfo(inputXInfo),

       inputYInfo(inputYInfo),

       outputInfo(outputInfo),

       inputXDecoder(inputXDecoder),

       inputYDecoder(inputYDecoder),

       outputEncoder(outputEncoder)

 {

     inputXData = this->inputXDecoder.DecodeTensor(inputXInfo.GetShape());

     inputYData = this->inputYDecoder.DecodeTensor(inputYInfo.GetShape());

     // At this point, we don't touch the input decoders - just the resultant vectors


     ApplyParams();


     ApplyBatchMatMul();

 }


 void BatchMatMul::ApplyBatchMatMul()

 {

     auto axesXToMul = BatchMatMulDescriptor::GetAxesToMul(params.m_DataLayoutX,

                                                           inputXInfo.GetShape());

     auto axesYToMul = BatchMatMulDescriptor::GetAxesToMul(params.m_DataLayoutY,

                                                           inputYInfo.GetShape());


     // the inputYRowSize (or inputXColSize) needs to be obtained using the original (unadjusted) axis value,

     // because it's obtained from the original tensor shape

     unsigned int inputYRowSize = inputYInfo.GetShape()[axesYToMul.first];


     AdjustAxesToMulForUnequalRanks(axesXToMul, axesYToMul);


     unsigned int inputXColDim = axesXToMul.second;

     unsigned int inputYRowDim = axesYToMul.first;


     auto batchMatMulOperation = [&](const std::vector<unsigned int>& curIdx)

     {

         float sum = 0.0f;


         // InputYRowSize is synonymous with inputXColSize

         for (unsigned int inputYRowIdx = 0; inputYRowIdx < inputYRowSize; inputYRowIdx++) {

             auto xIdx = curIdx;

             xIdx[inputXColDim] = inputYRowIdx;


             auto yIdx = curIdx;

             yIdx[inputYRowDim] = inputYRowIdx;


             sum += (GetValueAt(DataSlot::InputX, xIdx) * GetValueAt(DataSlot::InputY, yIdx));

         }


         SetValueAt(sum, DataSlot::Output, curIdx);

     };


     auto startIdx = std::vector<unsigned int>(outputInfo.GetNumDimensions(), 0);

     RecurseTensor(outputInfo,

                   batchMatMulOperation,

                   startIdx,

                   0);

 }


 void BatchMatMul::ApplyParams()

 {

     if(params.m_TransposeX)

     {

         Transpose(DataSlot::InputX);

     }

     else if(params.m_AdjointX)

     {

         Adjoint(DataSlot::InputX);

     }

     if(params.m_TransposeY)

     {

         Transpose(DataSlot::InputY);

     }

     else if(params.m_AdjointY)

     {

         Adjoint(DataSlot::InputY);

     }

 }


 void BatchMatMul::Transpose(DataSlot type)

 {

     // AKA the permute of the tensor

     // This modifies the tensor's info.


     switch(type)

     {

         case DataSlot::InputX:

         {

             auto permuteVec = BatchMatMulDescriptor::GetPermuteVec(params.m_DataLayoutX,

                                                                    inputXInfo.GetShape());

             inputXInfo = armnnUtils::Permuted(inputXInfo, permuteVec);

             std::vector<float> temp(inputXData.size());

             armnnUtils::Permute(inputXInfo.GetShape(),

                                 permuteVec,

                                 inputXData.data(),

                                 temp.data(),

                                 sizeof(float));

             inputXData = temp;

             break;

         }

         case DataSlot::InputY:

         {

             auto permuteVec = BatchMatMulDescriptor::GetPermuteVec(params.m_DataLayoutY,

                                                                    inputYInfo.GetShape());

             inputYInfo = armnnUtils::Permuted(inputYInfo, permuteVec);

             std::vector<float> temp(inputYData.size());

             armnnUtils::Permute(inputYInfo.GetShape(),

                                 permuteVec,

                                 inputYData.data(),

                                 temp.data(),

                                 sizeof(float));

             inputYData = temp;

             break;

         }

         case DataSlot::Output: // We needn't transpose the output tensor

         default:

             break;

     }

 }


 void BatchMatMul::Adjoint(DataSlot type)

 {

     // Finding the adjoint of a square matrix:

     // Calculate the cofactor of each element (using Gauss elimination here)

     // Apply a transpose to it (this also modifies the tensor's info)


     TensorInfo& inputInfo = (type == DataSlot::InputX) ? inputXInfo : inputYInfo;

     const auto& dataLayout = (type == DataSlot::InputX) ? params.m_DataLayoutX : params.m_DataLayoutY;

     const auto axesToAdjoint = BatchMatMulDescriptor::GetAxesToMul(dataLayout,inputInfo.GetShape());


     // We grab a copy of the tensor data to prevent overwriting

     std::vector<float> inputDataClone = (type == DataSlot::InputX) ? inputXData : inputYData;


     // The sub-matrix is the resultant matrix when the row and column of the current index is removed

     unsigned int subMatAxisSize = inputInfo.GetShape()[axesToAdjoint.first] - 1;

     std::vector<std::vector<float>> subMat(subMatAxisSize,

                                            std::vector<float>(subMatAxisSize));


     // Lambdas for each sub-step of the cofactor operation

     auto almostEquals = [&](const float& a, const float& b, float unitsInLastPlace = 2.0f)

     {

         float diff = std::fabs(a-b);

         float bound = diff * std::numeric_limits<float>::epsilon() * unitsInLastPlace;

         return (diff <= bound) || (diff < std::numeric_limits<float>::min());

     };


     float swapMultiplier = std::numeric_limits<float>::max();

     auto swapRows = [&](unsigned int rowIdxA, unsigned int rowIdxB)

     {

         // Every row swap flips this around by the negative (set to 1 at the beginning of each cofactor op run)

         for(unsigned int colIdx = 0; colIdx < subMatAxisSize; colIdx++)

         {

             float tmp = subMat[rowIdxA][colIdx];

             subMat[rowIdxA][colIdx] = subMat[rowIdxB][colIdx];

             subMat[rowIdxB][colIdx] = tmp;

         }

         swapMultiplier *= -1.0f;

     };


     auto findNextValidPivotRowIdx = [&](unsigned int colIdx)

     {

         unsigned int result = std::numeric_limits<unsigned int>::max();


         // The original diagonal has been checked and is invalid

         for(unsigned int rowIdx = colIdx+1; rowIdx < subMatAxisSize; rowIdx++)

         {

             if(!almostEquals(subMat[rowIdx][colIdx], 0.0f))

             {

                 result = rowIdx;

                 break;

             }

         }

         return result;

     };


     auto eliminate = [&](const float& pivot, unsigned int pivotPos)

     {

         for(unsigned int rowIdx = pivotPos+1; rowIdx < subMatAxisSize; rowIdx++)

         {

             float multiplierNumerator = subMat[rowIdx][pivotPos];

             if(almostEquals(multiplierNumerator, 0.0f))

             {

                 continue;

             }

             float multiplier = multiplierNumerator / pivot; // Susceptible to floating point inaccuracies

                                                             // Hence the almostEquals usage to counteract this

             for(unsigned int colIdx = pivotPos; colIdx < subMatAxisSize; colIdx++)

             {

                 // We start at col=pivotPos as we have assumed that all elements

                 // to our left have been eliminated to zero already


                 // We subtract based on the element directly above us in our pivot row

                 subMat[rowIdx][colIdx] -= multiplier * subMat[pivotPos][colIdx];

             }

         }

     };


     auto cofactorOperation = [&](const std::vector<unsigned int>& curIdx)

     {

         auto row = curIdx[axesToAdjoint.first];

         auto col = curIdx[axesToAdjoint.second];


         float minorMultiplier = static_cast<float>(std::pow(-1, (row + 1 + col + 1)));


         for(unsigned int subRow = 0; subRow < subMatAxisSize; subRow++)

         {

             for(unsigned int subCol = 0; subCol < subMatAxisSize; subCol++)

             {

                 unsigned int outerRow = (subRow >= row)?subRow + 1:subRow;

                 unsigned int outerCol = (subCol >= col)?subCol + 1:subCol;

                 auto cloneIdx = curIdx;

                 cloneIdx[axesToAdjoint.first] = outerRow;

                 cloneIdx[axesToAdjoint.second] = outerCol;

                 subMat[subRow][subCol] = GetValueAt(type,cloneIdx,inputDataClone);

             }

         }


         float determinant = 1.0f;


         // Cover the edge cases and simple base cases before resorting to Gauss elimination for larger matrices

         switch(subMatAxisSize)

         {

             case 0:

             {

                 determinant = GetValueAt(type, curIdx, inputDataClone);

                 break;

             }

             case 1:

             {

                 // If the resultant sub-matrix is just one element - that's the determinant

                 determinant = subMat[0][0];

                 break;

             }

             case 2:

             {

                 // For a 2x2 sub-matrix, the determinant is just a*d-b*c

                 determinant = subMat[0][0] * subMat[1][1] -

                               subMat[0][1] * subMat[1][0];

                 break;

             }

             default:

             {

                 // Gaussian elimination to find the determinant of this sub-matrix

                 swapMultiplier = 1.0f;

                 // March diagonally down the pivots and if it's invalid (a zero), swap the row with the

                 // nearest non-zero down within the column

                 for(unsigned int pivotRow = 0, pivotCol = 0;

                     pivotRow < subMatAxisSize;

                     pivotRow++, pivotCol++)

                 {

                     float& pivot = subMat[pivotRow][pivotCol];


                     if(almostEquals(pivot, 0.0f))

                     {

                         unsigned int nextValidPivotRowIdx = findNextValidPivotRowIdx(pivotCol);

                         if(nextValidPivotRowIdx == std::numeric_limits<unsigned int>::max())

                         {

                             // No valid pivot down this column, which means that this pivot remains a zero.

                             // This results in the determinant for this entire sub-matrix to just be zero.

                             determinant = 0.0f;

                             break;

                         }

                         swapRows(pivotRow, nextValidPivotRowIdx);

                     }

                     determinant *= pivot;

                     // The actual elimination bit (which will update/propagate to the pivots down the line)

                     eliminate(pivot, pivotRow); // Synonymous with pivotCol

                 }


                 determinant *= swapMultiplier;

                 break;

             }

         }

         float cofactor = minorMultiplier * determinant;

         SetValueAt(cofactor, type, curIdx);

     };


     auto startIdx = std::vector<unsigned int>(inputInfo.GetNumDimensions(), 0);

     RecurseTensor(inputInfo,

                   cofactorOperation,

                   startIdx,

                   0);


     Transpose(type);

 }


 void BatchMatMul::RecurseTensor(const TensorInfo& tensorInfo,

                                 const std::function<void(const std::vector<unsigned int>&)>& operation,

                                 std::vector<unsigned int>& curIdx,

                                 unsigned int curDim)

 {

     if(!(curDim < tensorInfo.GetNumDimensions()))

     {

         // We're at the leaf level of this call tree, so we operate here (each leaf is a data point)

         operation(curIdx);

         return;

     }


     for(unsigned int i = 0; i < tensorInfo.GetShape()[curDim]; i++)

     {

         curIdx[curDim] = i;

         RecurseTensor(tensorInfo,

                       operation,

                       curIdx,

                       curDim + 1);

     }

 }


 void BatchMatMul::AdjustAxesToMulForUnequalRanks(std::pair<unsigned int, unsigned int>& axesXToMul,

                                                  std::pair<unsigned int, unsigned int>& axesYToMul)

 {

     int rankDiff = static_cast<int>(inputXInfo.GetNumDimensions()) -

                    static_cast<int>(inputYInfo.GetNumDimensions());

     if(rankDiff == 0)

     {

         return;

     }

     else if(rankDiff < 0)

     {

         // Y is the larger one

         axesXToMul.first += static_cast<std::make_unsigned<unsigned int>::type>(std::abs(rankDiff));

         axesXToMul.second += static_cast<std::make_unsigned<unsigned int>::type>(std::abs(rankDiff));

     }

     else if(rankDiff > 0)

     {

         // X is the larger one

         axesYToMul.first += static_cast<std::make_unsigned<unsigned int>::type>(std::abs(rankDiff));

         axesYToMul.second += static_cast<std::make_unsigned<unsigned int>::type>(std::abs(rankDiff));

     }

 }


 float BatchMatMul::GetValueAt(DataSlot type, std::vector<unsigned int> idx, const std::vector<float>& customData)

 {

     // This gets the data from the input vector that we have, Not the decoder

     // But for the output, it is operating on the encoder itself


     AdjustToSafeIdx(type, idx);

     unsigned int flatIdx = CalcFlatIdx(type, idx);

     float value = 0.0f;

     switch(type)

     {

         case DataSlot::InputX:

             value = customData.empty() ? inputXData[flatIdx] : customData[flatIdx];

             break;

         case DataSlot::InputY:

             value = customData.empty() ? inputYData[flatIdx] : customData[flatIdx];

             break;

         case DataSlot::Output:

             outputEncoder[flatIdx];

             value = outputEncoder.Get();

             break;

         default:

             break;

     }


     return value;

 }


 void BatchMatMul::SetValueAt(float value, DataSlot type, std::vector<unsigned int> idx)

 {

     AdjustToSafeIdx(type, idx);

     unsigned int flatIdx = CalcFlatIdx(type, idx);

     switch(type)

     {

         case DataSlot::InputX:

             inputXData[flatIdx] = value;

             break;

         case DataSlot::InputY:

             inputYData[flatIdx] = value;

             break;

         case DataSlot::Output:

             outputEncoder[flatIdx];

             outputEncoder.Set(value);

             break;

         default:

             break;

     }

 }


 void BatchMatMul::AdjustToSafeIdx(DataSlot type, std::vector<unsigned int>& idx)

 {

     for(unsigned int dim = 0; dim < idx.size(); dim++)

     {

         switch(type)

         {

             case DataSlot::InputX:

             {

                 auto xRank = inputXInfo.GetNumDimensions();

                 auto xDiff = outputInfo.GetNumDimensions() - xRank;

                 if (dim < xDiff ||

                     idx[dim] > inputXInfo.GetShape()[dim-xDiff]-1)

                 {

                     idx[dim] = 0; // Broadcasting

                 }

                 break;

             }

             case DataSlot::InputY:

             {

                 auto yRank = inputYInfo.GetNumDimensions();

                 auto yDiff = outputInfo.GetNumDimensions() - yRank;

                 if (dim < yDiff ||

                     idx[dim] > inputYInfo.GetShape()[dim-yDiff]-1)

                 {

                     idx[dim] = 0;

                 }

                 break;

             }

             case DataSlot::Output:

             {

                 // Our indices are based off the output

                 break;

             }

             default:

                 break;

         }

     }

 }


 unsigned int BatchMatMul::CalcFlatIdx(DataSlot type, const std::vector<unsigned int>& idx)

 {

     unsigned int result = idx[idx.size()-1];

     unsigned int dimMultiplier = 1;

     unsigned int offset = 0;


     // -2 because final dim is already accounted for in the multiplier (last dim is just a multiplier of 1x)

     // Check offset in relation to i, to stop calculating flat index once all input shape fields considered

     for(unsigned int i = static_cast<unsigned int>(idx.size()-2); static_cast<int>(i) >= 0 && (i + 1) > offset; i--)

     {

         switch(type)

         {

             case DataSlot::InputX:

                 offset = outputInfo.GetNumDimensions() - inputXInfo.GetNumDimensions();

                 dimMultiplier *= inputXInfo.GetShape()[i + 1 - offset];

                 break;

             case DataSlot::InputY:

                 offset = outputInfo.GetNumDimensions() - inputYInfo.GetNumDimensions();

                 dimMultiplier *= inputYInfo.GetShape()[i + 1 - offset];

                 break;

             case DataSlot::Output:

                 dimMultiplier *= outputInfo.GetShape()[i+1];

                 break;

             default:

                 break;

         }

         result += (idx[i] * dimMultiplier);

     }

     return result;

 }


 } // namespace armnn

BatchMatMulImpl.hpp

Logging.hpp

Permute.hpp

WorkloadData.hpp

armnn::BatchMatMul::BatchMatMul
BatchMatMul(const BatchMatMulDescriptor &params, const TensorInfo &inputXInfo, const TensorInfo &inputYInfo, const TensorInfo &outputInfo, Decoder< float > &inputXDecoder, Decoder< float > &inputYDecoder, Encoder< float > &outputEncoder)
Definition: BatchMatMulImpl.cpp:15

armnn::Decoder< float >

armnn::Decoder::DecodeTensor
virtual std::vector< float > DecodeTensor(const TensorShape &tensorShape, bool isDepthwise=false)=0

armnn::Encoder< float >

armnn::Encoder::Get
virtual IType Get() const =0

armnn::Encoder::Set
virtual void Set(IType right)=0

armnn::TensorInfo
Definition: Tensor.hpp:153

armnn::TensorInfo::GetNumDimensions
unsigned int GetNumDimensions() const
Definition: Tensor.hpp:197

armnn::TensorInfo::GetShape
const TensorShape & GetShape() const
Definition: Tensor.hpp:193

armnn
Copyright (c) 2021 ARM Limited and Contributors.
Definition: 01_00_quick_start.dox:7

armnnUtils::Permuted
armnn::TensorShape Permuted(const armnn::TensorShape &srcShape, const armnn::PermutationVector &mappings)
Definition: Permute.cpp:125

armnnUtils::Permute
void Permute(const armnn::TensorShape &dstShape, const armnn::PermutationVector &mappings, const void *src, void *dst, size_t dataTypeSize)
Definition: Permute.cpp:164

armnn::BatchMatMulDescriptor
A BatchMatMulDescriptor for the BatchMatMul operator.
Definition: Descriptors.hpp:1585

armnn::BatchMatMulDescriptor::m_AdjointX
bool m_AdjointX
Adjoint the slices of each input tensor Transpose and Adjoint can not both be set to true for the sam...
Definition: Descriptors.hpp:1617

armnn::BatchMatMulDescriptor::m_TransposeY
bool m_TransposeY
Definition: Descriptors.hpp:1613

armnn::BatchMatMulDescriptor::GetAxesToMul
static std::pair< unsigned int, unsigned int > GetAxesToMul(DataLayout dataLayout, const TensorShape &tensorShape)
Static helper to get the two axes (for each input) for multiplication.
Definition: Descriptors.cpp:485

armnn::BatchMatMulDescriptor::GetPermuteVec
static PermutationVector GetPermuteVec(DataLayout dataLayout, const TensorShape &tensorShape)
Static helper to get the axes which will be transposed.
Definition: Descriptors.cpp:523

armnn::BatchMatMulDescriptor::m_DataLayoutY
DataLayout m_DataLayoutY
Definition: Descriptors.hpp:1622

armnn::BatchMatMulDescriptor::m_TransposeX
bool m_TransposeX
Transpose the slices of each input tensor Transpose and Adjoint can not both be set to true for the s...
Definition: Descriptors.hpp:1612

armnn::BatchMatMulDescriptor::m_AdjointY
bool m_AdjointY
Definition: Descriptors.hpp:1618

armnn::BatchMatMulDescriptor::m_DataLayoutX
DataLayout m_DataLayoutX
Data layout of each input tensor, such as NHWC/NDHWC (leave as default for arbitrary layout)
Definition: Descriptors.hpp:1621