ComputeLibrary/v21.05/_cpu_depthwise_convolution_assembly_dispatch_8cpp_source.xhtml

 /*
  * Copyright (c) 2019-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to
  * deal in the Software without restriction, including without limitation the
  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
  * The above copyright notice and this permission notice shall be included in all
  * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */

 #include "src/runtime/cpu/operators/CpuDepthwiseConvolutionAssemblyDispatch.h"

 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/misc/InfoHelpers.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "src/core/CPP/Validate.h"
 #include "src/core/NEON/kernels/assembly/NEDepthwiseConvolutionAssemblyKernelWrapper.h"
 #include "src/core/NEON/kernels/convolution/depthwise/depthwise_dilated.hpp"
 #include "src/core/NEON/kernels/convolution/depthwise/depthwise_quantized_dilated.hpp"
 #include "src/core/helpers/AutoConfiguration.h"

 #include "arm_compute/runtime/NEON/NEScheduler.h"

 #include <set>

 namespace arm_compute
 {
 namespace cpu
 {
 namespace
 {
 std::unique_ptr<depthwise::IDepthwiseConvolution> get_qasymm8_convolver(int kernel_size, int stride_x,
                                                                         int n_batches, int in_rows, int in_cols, int n_channels,
                                                                         int dilation_factor, neon_convolution_kernels::ActivationFunction activation,
                                                                         const qasymm8::QAsymm8Params &wqinfo, const qasymm8::QAsymm8Params &iqinfo, const qasymm8::QAsymm8Params &oqinfo,
                                                                         const qasymm8::QAsymm8RescaleParams &rescale_params,
                                                                         int padding_top, int padding_left, int padding_bottom, int padding_right)
 {
     switch(kernel_size)
     {
         case 3:
         {
             switch(stride_x)
             {
                 case 1:
                     return std::make_unique<depthwise::QAsymm8DilatedDepthwiseConvolution<2, 2, 3, 3, 1, 1>>(
                                n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
                 case 2:
                     return std::make_unique<depthwise::QAsymm8DilatedDepthwiseConvolution<2, 2, 3, 3, 2, 2>>(
                                n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
                 default:
                     return nullptr;
             }
         }
         case 5:
         {
             switch(stride_x)
             {
                 case 1:
                     return std::make_unique<depthwise::QAsymm8DilatedDepthwiseConvolution<2, 2, 5, 5, 1, 1>>(
                                n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
                 case 2:
                     return std::make_unique<depthwise::QAsymm8DilatedDepthwiseConvolution<2, 2, 5, 5, 2, 2>>(
                                n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
                 default:
                     return nullptr;
             }
         }
         default:
             return nullptr;
     }
 }

 std::unique_ptr<depthwise::IDepthwiseConvolution> get_qsymm8_perchannel_convolver(int kernel_size, int stride_x,
                                                                                   int n_batches, int in_rows, int in_cols, int n_channels,
                                                                                   neon_convolution_kernels::ActivationFunction activation,
                                                                                   const qsymm8::QSymm8PerChannelParams &wqinfo, const qasymm8::QAsymm8Params &iqinfo, const qasymm8::QAsymm8Params &oqinfo,
                                                                                   const qsymm8::QSymm8PerChannelRescaleParams &rescale_params,
                                                                                   int padding_top, int padding_left, int padding_bottom, int padding_right)
 {
     switch(kernel_size)
     {
         case 3:
         {
             switch(stride_x)
             {
                 case 1:
                     return std::make_unique<depthwise::QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 3, 3, 1, 1>>(
                                n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
                 case 2:
                     return std::make_unique<depthwise::QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 3, 3, 2, 2>>(
                                n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
                 default:
                     return nullptr;
             }
         }
         case 5:
         {
             switch(stride_x)
             {
                 case 1:
                     return std::make_unique<depthwise::QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 5, 5, 1, 1>>(
                                n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
                 case 2:
                     return std::make_unique<depthwise::QSymm8HybridPerChannelDepthwiseConvolution<2, 2, 5, 5, 2, 2>>(
                                n_batches, in_rows, in_cols, n_channels, activation, wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
                 default:
                     return nullptr;
             }
         }
         default:
             return nullptr;
     }
 }

 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 std::unique_ptr<depthwise::IDepthwiseConvolution> get_fp16_convolver(int kernel_size, int stride_x,
                                                                      int n_batches, int in_rows, int in_cols, int n_channels,
                                                                      int dilation_factor, neon_convolution_kernels::ActivationFunction activation,
                                                                      int padding_top, int padding_left, int padding_bottom, int padding_right)
 {
     switch(kernel_size)
     {
         case 3:
         {
             switch(stride_x)
             {
                 case 1:
                     return std::make_unique<depthwise::DilatedDepthwiseConvolution<3, 3, 3, 3, 1, 1, float16_t, float16_t, float16_t>>(
                                n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
                 case 2:
                     return std::make_unique<depthwise::DilatedDepthwiseConvolution<3, 3, 3, 3, 2, 2, float16_t, float16_t, float16_t>>(
                                n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
                 default:
                     return nullptr;
             }
         }
         case 5:
         {
             switch(stride_x)
             {
                 case 1:
                     return std::make_unique<depthwise::DilatedDepthwiseConvolution<3, 3, 5, 5, 1, 1, float16_t, float16_t, float16_t>>(
                                n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
                 case 2:
                     return std::make_unique<depthwise::DilatedDepthwiseConvolution<3, 3, 5, 5, 2, 2, float16_t, float16_t, float16_t>>(
                                n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
                 default:
                     return nullptr;
             }
         }
         default:
             return nullptr;
     }
 }
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

 std::unique_ptr<depthwise::IDepthwiseConvolution> get_fp32_convolver(int kernel_size, int stride_x,
                                                                      int n_batches, int in_rows, int in_cols, int n_channels,
                                                                      int dilation_factor, neon_convolution_kernels::ActivationFunction activation,
                                                                      int padding_top, int padding_left, int padding_bottom, int padding_right)
 {
     switch(kernel_size)
     {
         case 3:
         {
             switch(stride_x)
             {
                 case 1:
                     return std::make_unique<depthwise::DilatedDepthwiseConvolution<4, 4, 3, 3, 1, 1, float, float, float>>(
                                n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
                 case 2:
                     return std::make_unique<depthwise::DilatedDepthwiseConvolution<3, 3, 3, 3, 2, 2, float, float, float>>(
                                n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
                 default:
                     return nullptr;
             }
         }
         case 5:
         {
             switch(stride_x)
             {
                 case 1:
                     return std::make_unique<depthwise::DilatedDepthwiseConvolution<4, 4, 5, 5, 1, 1, float, float, float>>(
                                n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
                 case 2:
                     return std::make_unique<depthwise::DilatedDepthwiseConvolution<3, 3, 5, 5, 2, 2, float, float, float>>(
                                n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
                 default:
                     return nullptr;
             }
         }
         default:
             return nullptr;
     }
 }

 std::unique_ptr<depthwise::IDepthwiseConvolution> create_convolver(const ITensorInfo     *input,
                                                                    const ITensorInfo     *weights,
                                                                    ITensorInfo           *output,
                                                                    const ConvolutionInfo &info)
 {
     const DataType    data_type = input->data_type();
     const TensorShape shape     = input->tensor_shape();

     const int n_batches       = shape[3];
     const int in_rows         = shape.z();
     const int in_cols         = shape.y();
     const int n_channels      = shape.x();
     const int dilation_factor = info.dilation.x();
     const int padding_top     = info.pad_stride_info.pad_top();
     const int padding_left    = info.pad_stride_info.pad_left();
     const int padding_bottom  = info.pad_stride_info.pad_bottom();
     const int padding_right   = info.pad_stride_info.pad_right();

     const bool is_uniform_quantized    = (data_type == DataType::QASYMM8) && (weights->data_type() == DataType::QASYMM8);
     const bool is_perchannel_quantized = (data_type == DataType::QASYMM8) && (weights->data_type() == DataType::QSYMM8_PER_CHANNEL);

     const unsigned int stride_x    = info.pad_stride_info.stride().first;
     const unsigned int kernel_size = weights->tensor_shape().y();

     // Map activation function
     neon_convolution_kernels::ActivationFunction activation = neon_convolution_kernels::ActivationFunction::None;
     if(arm_compute::utils::info_helpers::is_relu(info.act_info))
     {
         activation = neon_convolution_kernels::ActivationFunction::ReLU;
     }
     else if(arm_compute::utils::info_helpers::is_relu6(info.act_info))
     {
         activation = neon_convolution_kernels::ActivationFunction::ReLU6;
     }

     // Create quantized convolver
     if(is_uniform_quantized)
     {
         const UniformQuantizationInfo input_qinfo   = input->quantization_info().uniform();
         const UniformQuantizationInfo weights_qinfo = weights->quantization_info().uniform();
         const UniformQuantizationInfo output_qinfo  = output->quantization_info().uniform();

         // Check that quantization info are in the range [0, 255]
         ARM_COMPUTE_ERROR_ON(input_qinfo.offset < 0 || input_qinfo.offset > 255);
         ARM_COMPUTE_ERROR_ON(weights_qinfo.offset < 0 || weights_qinfo.offset > 255);
         ARM_COMPUTE_ERROR_ON(output_qinfo.offset < 0 || output_qinfo.offset > 255);
         const qasymm8::QAsymm8Params iqinfo{ static_cast<uint8_t>(input_qinfo.offset), input_qinfo.scale };
         const qasymm8::QAsymm8Params wqinfo{ static_cast<uint8_t>(weights_qinfo.offset), weights_qinfo.scale };
         const qasymm8::QAsymm8Params oqinfo{ static_cast<uint8_t>(output_qinfo.offset), output_qinfo.scale };

         // Calculate rescale parameters
         const float fmultipler  = iqinfo.scale * wqinfo.scale / oqinfo.scale;
         int32_t     qmultiplier = 0;
         int32_t     qshift      = 0;
         quantization::calculate_quantized_multiplier_less_than_one(fmultipler, &qmultiplier, &qshift);
         qasymm8::QAsymm8RescaleParams rescale_params(qshift, qmultiplier, fmultipler);

         return get_qasymm8_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, dilation_factor, activation,
                                      wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
     }
     else if(is_perchannel_quantized)
     {
         const UniformQuantizationInfo input_qinfo   = input->quantization_info().uniform();
         const QuantizationInfo        weights_qinfo = weights->quantization_info();
         const UniformQuantizationInfo output_qinfo  = output->quantization_info().uniform();

         // Check that quantization info are in the range [0, 255]
         ARM_COMPUTE_ERROR_ON(input_qinfo.offset < 0 || input_qinfo.offset > 255);
         ARM_COMPUTE_ERROR_ON(output_qinfo.offset < 0 || output_qinfo.offset > 255);
         const qasymm8::QAsymm8Params         iqinfo{ static_cast<uint8_t>(input_qinfo.offset), input_qinfo.scale };
         const qsymm8::QSymm8PerChannelParams wqinfo{ weights_qinfo.scale() };
         const qasymm8::QAsymm8Params         oqinfo{ static_cast<uint8_t>(output_qinfo.offset), output_qinfo.scale };

         // Calculate rescale parameters
         std::vector<float>   fmultipliers;
         std::vector<int32_t> qmultipliers;
         std::vector<int32_t> qshifts;

         for(auto const s : wqinfo.scales)
         {
             const float fmultipler  = iqinfo.scale * s / oqinfo.scale;
             int32_t     qmultiplier = 0;
             int32_t     qshift      = 0;
             quantization::calculate_quantized_multiplier_less_than_one(fmultipler, &qmultiplier, &qshift);
             fmultipliers.push_back(fmultipler);
             qmultipliers.push_back(qmultiplier);
             qshifts.push_back(qshift);
         }

         qsymm8::QSymm8PerChannelRescaleParams rescale_params(qshifts, qmultipliers, fmultipliers);

         return get_qsymm8_perchannel_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, activation,
                                                wqinfo, iqinfo, oqinfo, rescale_params, padding_top, padding_left, padding_bottom, padding_right);
     }
     else
     {
         // Create float convolver
         switch(data_type)
         {
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
             case DataType::F16:
             {
                 return get_fp16_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
             }
 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
             case DataType::F32:
             {
                 return get_fp32_convolver(kernel_size, stride_x, n_batches, in_rows, in_cols, n_channels, dilation_factor, activation, padding_top, padding_left, padding_bottom, padding_right);
             }
             default:
                 return nullptr;
         }
     }
 }
 } // namespace

 struct CpuDepthwiseConvolutionAssemblyDispatch::LocalImpl
 {
     std::unique_ptr<depthwise::IDepthwiseConvolution> dwc_assembly_kernel{ nullptr };
     NEDepthwiseConvolutionAssemblyKernelWrapper       dwc_acl_kernel{};
     bool                                              is_prepared{ false };
     experimental::MemoryRequirements                  mem_req{};
 };

 #ifndef DOXYGEN_SKIP_THIS
 CpuDepthwiseConvolutionAssemblyDispatch::CpuDepthwiseConvolutionAssemblyDispatch()
     : _pImpl(std::make_unique<LocalImpl>())
 {
 }
 #endif /* DOXYGEN_SKIP_THIS */

 CpuDepthwiseConvolutionAssemblyDispatch::~CpuDepthwiseConvolutionAssemblyDispatch() = default;

 void CpuDepthwiseConvolutionAssemblyDispatch::configure(const ITensorInfo     *input,
                                                         const ITensorInfo     *weights,
                                                         const ITensorInfo     *bias,
                                                         ITensorInfo           *output,
                                                         const ConvolutionInfo &info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_UNUSED(bias);
     ARM_COMPUTE_ERROR_THROW_ON(CpuDepthwiseConvolutionAssemblyDispatch::validate(input,
                                                                                  weights,
                                                                                  bias != nullptr ? bias : nullptr,
                                                                                  output,
                                                                                  info));

     // Output auto inizialitation if not yet initialized
     const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info);
     auto_init_if_empty(*output, input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_quantization_info(output->quantization_info()));

     _pImpl->is_prepared = false;

     // Create convolver
     _pImpl->dwc_assembly_kernel = create_convolver(input, weights, output, info);
     ARM_COMPUTE_ERROR_ON(_pImpl->dwc_assembly_kernel == nullptr);

     // Create assembly kernel wrapper
     _pImpl->dwc_acl_kernel.configure(_pImpl->dwc_assembly_kernel.get());

     constexpr size_t alignment = 128;

     // Create workspace
     const unsigned int num_threads    = NEScheduler::get().num_threads();
     const size_t       workspace_size = _pImpl->dwc_assembly_kernel->get_working_space_size(num_threads);
     ARM_COMPUTE_ERROR_ON_MSG(workspace_size == 0, "Workspace size cannot be 0 !");
     _pImpl->mem_req.push_back({ TensorType::ACL_INT_0, workspace_size, alignment });

     // Create packing tensor
     const size_t pack_tensor_size = _pImpl->dwc_assembly_kernel->get_packed_params_size();
     ARM_COMPUTE_ERROR_ON_MSG(pack_tensor_size == 0, "Pack tensor size cannot be 0 !");

     _pImpl->mem_req.push_back({ TensorType::ACL_INT_1, pack_tensor_size, alignment });
 }

 experimental::MemoryRequirements CpuDepthwiseConvolutionAssemblyDispatch::workspace() const
 {
     return _pImpl->mem_req;
 }

 Status CpuDepthwiseConvolutionAssemblyDispatch::validate(const ITensorInfo     *input,
                                                          const ITensorInfo     *weights,
                                                          const ITensorInfo     *bias,
                                                          const ITensorInfo     *output,
                                                          const ConvolutionInfo &info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F16, DataType::F32);
     if(weights->data_type() != DataType::QSYMM8_PER_CHANNEL)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
     }
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);

     // Validate convolver
     ARM_COMPUTE_RETURN_ERROR_ON(!is_optimized_supported(input, weights, info));

     // Validate activation
     const bool is_relu  = arm_compute::utils::info_helpers::is_relu(info.act_info);
     const bool is_relu6 = arm_compute::utils::info_helpers::is_relu6(info.act_info);
     ARM_COMPUTE_RETURN_ERROR_ON(info.act_info.enabled() && !(is_relu || is_relu6));

     // Check bias
     if(bias != nullptr)
     {
         unsigned int channel_idx = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
         ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
         ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != weights->dimension(channel_idx));
     }

     // Check output
     if(output->total_size() != 0)
     {
         const TensorShape output_shape = misc::shape_calculator::compute_depthwise_convolution_shape(*input, *weights, info);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     }

     // The uniform quantization case will only have 1 scale value in the weights quantization info
     const UniformQuantizationInfo input_qinfo   = input->quantization_info().uniform();
     const QuantizationInfo        weights_qinfo = weights->quantization_info();
     const UniformQuantizationInfo output_qinfo  = output->quantization_info().uniform();
     for(auto const s : weights_qinfo.scale())
     {
         const float fmultipler = input_qinfo.scale * s / output_qinfo.scale;
         ARM_COMPUTE_RETURN_ERROR_ON(fmultipler > 1.f);
     }

     return Status{};
 }

 bool CpuDepthwiseConvolutionAssemblyDispatch::is_optimized_supported(const ITensorInfo     *input,
                                                                      const ITensorInfo     *weights,
                                                                      const ConvolutionInfo &info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);

     // Reshape input shape if in NHWC format
     const DataLayout data_layout = input->data_layout();
     TensorShape      in_shape{ input->tensor_shape() };
     if(data_layout == DataLayout::NHWC)
     {
         in_shape.set(Window::DimX, input->tensor_shape().y());
         in_shape.set(Window::DimY, input->tensor_shape().z());
         in_shape.set(Window::DimZ, input->tensor_shape().x());
     }

     // Check data type
     const DataType input_type            = input->data_type();
     const bool     is_input_type_valid   = is_data_type_float(input_type) || input_type == DataType::QASYMM8;
     const DataType weights_type          = weights->data_type();
     const bool     is_weights_type_valid = is_data_type_float(weights_type) || weights_type == DataType::QASYMM8 || weights_type == DataType::QASYMM8_SIGNED
                                            || weights_type == DataType::QSYMM8_PER_CHANNEL;

     // Check weighs size
     std::set<unsigned int> supported_kernel_sizes = { 3, 5 };
     const unsigned int     width_idx              = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
     const unsigned int     height_idx             = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
     const unsigned int     kernel_w               = weights->dimension(width_idx);
     const unsigned int     kernel_h               = weights->dimension(height_idx);
     bool                   weights_supported      = (kernel_w == kernel_h) && (supported_kernel_sizes.count(kernel_w) != 0);

     // Check for supported strides
     const auto &strides           = info.pad_stride_info.stride();
     bool        supported_strides = (strides.first == strides.second) && ((strides.first == 1) || (strides.first == 2));

     // Check for supported padding
     const auto    pad_top           = info.pad_stride_info.pad_top();
     const auto    pad_right         = info.pad_stride_info.pad_right();
     const auto    pad_bottom        = info.pad_stride_info.pad_bottom();
     const auto    pad_left          = info.pad_stride_info.pad_left();
     PadStrideInfo same_pad          = calculate_same_pad(in_shape, TensorShape(kernel_w, kernel_h), info.pad_stride_info, DataLayout::NCHW, info.dilation);
     bool          is_same_padding   = (pad_top == same_pad.pad_top()) && (pad_right == same_pad.pad_right()) && (pad_bottom == same_pad.pad_bottom()) && (pad_left == same_pad.pad_left());
     bool          is_valid_padding  = (pad_top == 0) && (pad_right == 0) && (pad_bottom == 0) && (pad_left == 0);
     bool          supported_padding = is_same_padding || is_valid_padding;
     // TODO(COMPMID-2464): Enable once dilated conv with stride 2 is supported
     bool is_dilation_supported = ((info.dilation == Size2D(1U, 1U)) || ((info.dilation.x() == info.dilation.y()) && strides.first == 1));

     if(weights_type == DataType::QSYMM8_PER_CHANNEL)
     {
         is_dilation_supported = is_dilation_supported && (info.dilation == Size2D(1U, 1U));
     }

     return is_input_type_valid && is_weights_type_valid && weights_supported && supported_strides && supported_padding && (info.depth_multiplier == 1) && is_dilation_supported;
 }

 void CpuDepthwiseConvolutionAssemblyDispatch::run(ITensorPack &tensors)
 {
     // Prepare assembly kernel
     prepare(tensors);

     auto src       = tensors.get_tensor(TensorType::ACL_SRC_0);
     auto workspace = tensors.get_tensor(TensorType::ACL_INT_0);
     auto dst       = tensors.get_tensor(TensorType::ACL_DST);

     // Setup inputs/outputs
     ARM_COMPUTE_ERROR_ON(workspace == nullptr && workspace->buffer() == nullptr);
     _pImpl->dwc_assembly_kernel->set_working_space(static_cast<void *>(workspace->buffer()));

     ARM_COMPUTE_ERROR_ON(workspace->buffer() == nullptr);
     const int   input_element_size = src->info()->element_size();
     const int   input_batch_stride = src->info()->strides_in_bytes()[3] / input_element_size;
     const int   input_row_stride   = src->info()->strides_in_bytes().z() / input_element_size;
     const int   input_col_stride   = src->info()->strides_in_bytes().y() / input_element_size;
     const void *input_ptr          = src->buffer() + src->info()->offset_first_element_in_bytes();
     _pImpl->dwc_assembly_kernel->set_input(input_ptr, input_batch_stride, input_row_stride, input_col_stride);

     ARM_COMPUTE_ERROR_ON(dst->buffer() == nullptr);
     const int output_element_size = dst->info()->element_size();
     const int output_batch_stride = dst->info()->strides_in_bytes()[3] / output_element_size;
     const int output_row_stride   = dst->info()->strides_in_bytes().z() / output_element_size;
     const int output_col_stride   = dst->info()->strides_in_bytes().y() / output_element_size;
     void     *output_ptr          = dst->buffer() + dst->info()->offset_first_element_in_bytes();
     _pImpl->dwc_assembly_kernel->set_output(output_ptr, output_batch_stride, output_row_stride, output_col_stride);

     // Schedule assembly kernel
     NEScheduler::get().schedule(&_pImpl->dwc_acl_kernel, Window::DimX);
 }

 void CpuDepthwiseConvolutionAssemblyDispatch::prepare(ITensorPack &tensors)
 {
     if(!_pImpl->is_prepared)
     {
         auto weights        = tensors.get_const_tensor(TensorType::ACL_SRC_1);
         auto bias           = tensors.get_const_tensor(TensorType::ACL_SRC_2);
         auto packed_weights = tensors.get_tensor(TensorType::ACL_INT_1);

         ARM_COMPUTE_ERROR_ON(packed_weights->buffer() == nullptr);

         // Pack weights and bias
         const int weights_element_size = weights->info()->element_size();
         const int weights_row_stride   = weights->info()->strides_in_bytes().z() / weights_element_size;
         const int weights_col_stride   = weights->info()->strides_in_bytes().y() / weights_element_size;
         _pImpl->dwc_assembly_kernel->pack_params(packed_weights->buffer(),
                                                  weights->buffer() + weights->info()->offset_first_element_in_bytes(),
                                                  weights_row_stride,
                                                  weights_col_stride,
                                                  (bias != nullptr) ? bias->buffer() : nullptr);
         _pImpl->dwc_assembly_kernel->set_packed_params_buffer(packed_weights->buffer());

         weights->mark_as_unused();
         if(bias != nullptr)
         {
             bias->mark_as_unused();
         }
         _pImpl->is_prepared = true;
     }
 }
 } // namespace cpu
 } // namespace arm_compute
arm_compute::cpu::CpuDepthwiseConvolutionAssemblyDispatch::validate
static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, const ITensorInfo *output, const ConvolutionInfo &info)
Static function to check if given info will lead to a valid configuration of CpuDepthwiseConvolutionA...
Definition: CpuDepthwiseConvolutionAssemblyDispatch.cpp:394

arm_compute::ITensorInfo::num_dimensions
virtual size_t num_dimensions() const =0
The number of dimensions of the tensor (rank)

arm_compute::test::validation::shape
shape
Definition: DFT.cpp:115

arm_compute::ACL_SRC_2
Definition: Types.h:45

arm_compute::TensorShape
Shape of a tensor.
Definition: TensorShape.h:39

arm_compute::misc::shape_calculator::compute_depthwise_convolution_shape
TensorShape compute_depthwise_convolution_shape(const ITensorInfo &input, const ITensorInfo &weights, const ConvolutionInfo &info)
Calculate the depthwise convolution output shape of a tensor.
Definition: ShapeCalculator.h:420

arm_compute::cpu::CpuDepthwiseConvolutionAssemblyDispatch::run
void run(ITensorPack &tensors) override
Run the kernels contained in the function.
Definition: CpuDepthwiseConvolutionAssemblyDispatch.cpp:500

arm_compute::cpu::CpuDepthwiseConvolutionAssemblyDispatch::workspace
experimental::MemoryRequirements workspace() const override
Return the memory requirements required by the workspace.
Definition: CpuDepthwiseConvolutionAssemblyDispatch.cpp:389

ITensor.h

arm_compute::ACL_DST
Definition: Types.h:46

ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(...)
Definition: Validate.h:490

ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED
#define ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(tensor)
Definition: Validate.h:108

arm_compute::ITensorInfo::dimension
virtual size_t dimension(size_t index) const =0
Return the size of the requested dimension.

arm_compute::cpu::CpuDepthwiseConvolutionAssemblyDispatch::prepare
void prepare(ITensorPack &tensors) override
Prepare the function for executing.
Definition: CpuDepthwiseConvolutionAssemblyDispatch.cpp:533

arm_compute::cpu::CpuDepthwiseConvolutionAssemblyDispatch::~CpuDepthwiseConvolutionAssemblyDispatch
~CpuDepthwiseConvolutionAssemblyDispatch()
Default destructor.

arm_compute::ITensorInfo::data_type
virtual DataType data_type() const =0
Data type used for each element of the tensor.

arm_compute::Format::F32
1 channel, 1 F32 per channel

arm_compute::DataLayoutDimension::HEIGHT
height

ARM_COMPUTE_ERROR_ON
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
Definition: Error.h:466

arm_compute::test::validation::data_layout
const DataLayout data_layout
Definition: Im2Col.cpp:151

arm_compute::ITensorInfo
Store the tensor's metadata.
Definition: ITensorInfo.h:40

ARM_COMPUTE_ERROR_THROW_ON
#define ARM_COMPUTE_ERROR_THROW_ON(status)
Definition: Error.h:455

arm_compute::UniformQuantizationInfo
Quantization info when assuming per layer quantization.
Definition: QuantizationInfo.h:43

arm_compute::UniformQuantizationInfo::scale
float scale
Definition: QuantizationInfo.h:65

arm_compute::PadStrideInfo::pad_top
unsigned int pad_top() const
Get the top padding.
Definition: Types.h:734

arm_compute::Status
Status class.
Definition: Error.h:52

arm_compute::cpu::CpuDepthwiseConvolutionAssemblyDispatch::is_optimized_supported
static bool is_optimized_supported(const ITensorInfo *input, const ITensorInfo *weights, const ConvolutionInfo &info)
Check if the optimized kernel can be used for the given kernel sizes and strides.
Definition: CpuDepthwiseConvolutionAssemblyDispatch.cpp:445

ARM_COMPUTE_RETURN_ERROR_ON
#define ARM_COMPUTE_RETURN_ERROR_ON(cond)
If the condition is true, an error is returned.
Definition: Error.h:296

ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(...)
Definition: Validate.h:284

tf_frozen_model_extractor.None
None
Definition: tf_frozen_model_extractor.py:41

arm_compute::test::validation::src
SimpleTensor< float > src
Definition: DFT.cpp:155

arm_compute
Copyright (c) 2017-2021 Arm Limited.
Definition: introduction.dox:24

arm_compute::experimental::MemoryRequirements
std::vector< MemoryInfo > MemoryRequirements
Definition: Types.h:74

arm_compute::Format::F16
1 channel, 1 F16 per channel

arm_compute::test::validation::input
auto input
Definition: LSTMLayerQuantized.cpp:486

Utils.h

arm_compute::test::validation::data_type
const DataType data_type
Definition: Im2Col.cpp:150

arm_compute::ITensorPack::get_const_tensor
const ITensor * get_const_tensor(int id) const
Get constant tensor of a given id.
Definition: ITensorPack.cpp:45

arm_compute::QuantizationInfo
Quantization information.
Definition: QuantizationInfo.h:70

arm_compute::Window::DimX
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
Definition: Window.h:43

ARM_COMPUTE_UNUSED
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
Definition: Error.h:152

arm_compute::test::validation::output_shape
TensorShape output_shape
Definition: LSTMLayerQuantized.cpp:469

arm_compute::ITensorInfo::tensor_shape
virtual const TensorShape & tensor_shape() const =0
Size for each dimension of the tensor.

arm_compute::DataType::QASYMM8
quantized, asymmetric fixed-point 8-bit number unsigned

ARM_COMPUTE_ERROR_ON_MSG
#define ARM_COMPUTE_ERROR_ON_MSG(cond, msg)
Definition: Error.h:456

arm_compute::utils::cast::U
U
Definition: SaturateCast.h:57

arm_compute::QuantizationInfo::uniform
UniformQuantizationInfo uniform() const
Return per layer quantization info.
Definition: QuantizationInfo.h:149

arm_compute::auto_init_if_empty
bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, QuantizationInfo quantization_info=QuantizationInfo())
Auto initialize the tensor info (shape, number of channels and data type) if the current assignment i...
Definition: AutoConfiguration.h:42

ShapeCalculator.h

arm_compute::ConvolutionInfo
Definition: Types.h:1864

arm_compute::PadStrideInfo::pad_right
unsigned int pad_right() const
Get the right padding.
Definition: Types.h:729

arm_compute::QuantizationInfo::scale
const std::vector< float > & scale() const
Scale vector accessor.
Definition: QuantizationInfo.h:125

arm_compute::PadStrideInfo
Padding and stride information class.
Definition: Types.h:650

arm_compute::test::validation::dst
auto dst
Definition: DFT.cpp:170

NEScheduler.h

arm_compute::DataLayoutDimension::CHANNEL
channel

arm_compute::ACL_INT_1
Definition: Types.h:52

arm_compute::ITensorInfo::quantization_info
virtual QuantizationInfo quantization_info() const =0
Get the quantization settings (scale and offset) of the tensor.

CpuDepthwiseConvolutionAssemblyDispatch.h

arm_compute::ACL_SRC_0
Definition: Types.h:43

arm_compute::DataLayout::NCHW
Num samples, channels, height, width.

arm_compute::ACL_SRC_1
Definition: Types.h:44

arm_compute::utils::info_helpers::is_relu6
bool is_relu6(ActivationLayerInfo activation_info)
Checks if activation information correspond to a relu6 activation function.
Definition: InfoHelpers.h:54

arm_compute::DataType::QSYMM8_PER_CHANNEL
quantized, symmetric per channel fixed-point 8-bit number

InfoHelpers.h

arm_compute::ACL_INT_0
Definition: Types.h:51

arm_compute::Window::DimY
static constexpr size_t DimY
Alias for dimension 1 also known as Y dimension.
Definition: Window.h:45

arm_compute::test::validation::info
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)

arm_compute::ITensorPack::get_tensor
ITensor * get_tensor(int id)
Get tensor of a given id from the pac.
Definition: ITensorPack.cpp:55

AutoConfiguration.h

arm_compute::ITensorInfo::total_size
virtual size_t total_size() const =0
Returns the total size of the tensor in bytes.

arm_compute::IScheduler::schedule
virtual void schedule(ICPPKernel *kernel, const Hints &hints)=0
Runs the kernel in the same thread as the caller synchronously.

arm_compute::Window::DimZ
static constexpr size_t DimZ
Alias for dimension 2 also known as Z dimension.
Definition: Window.h:47

arm_compute::Size2D
Class for specifying the size of an image or rectangle.
Definition: Size2D.h:34

ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(...)
Definition: Validate.h:541

arm_compute::DataLayout::NHWC
Num samples, height, width, channels.

ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
Definition: Validate.h:788

Validate.h

arm_compute::cpu::CpuDepthwiseConvolutionAssemblyDispatch::configure
void configure(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *bias, ITensorInfo *output, const ConvolutionInfo &info)
Initialize the function's source, destination, kernels and border_size.
Definition: CpuDepthwiseConvolutionAssemblyDispatch.cpp:347

arm_compute::utils::info_helpers::is_relu
bool is_relu(ActivationLayerInfo activation_info)
Checks if activation information correspond to a relu activation function.
Definition: InfoHelpers.h:43

arm_compute::ITensorPack
Tensor packing service.
Definition: ITensorPack.h:37

arm_compute::DataLayoutDimension::WIDTH
width

ARM_COMPUTE_ERROR_ON_NULLPTR
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
Definition: Validate.h:157

arm_compute::DataType::QASYMM8_SIGNED
quantized, asymmetric fixed-point 8-bit number signed

arm_compute::IScheduler::num_threads
virtual unsigned int num_threads() const =0
Returns the number of threads that the SingleThreadScheduler has in his pool.

arm_compute::get_data_layout_dimension_index
size_t get_data_layout_dimension_index(const DataLayout data_layout, const DataLayoutDimension data_layout_dimension)
Get the index of the given dimension.
Definition: Helpers.inl:193

AsymmHelpers.h

arm_compute::quantization::calculate_quantized_multiplier_less_than_one
Status calculate_quantized_multiplier_less_than_one(float multiplier, int32_t *quant_multiplier, int32_t *right_shift, bool ignore_epsilon=false)
Calculate quantized representation of multiplier with value less than one.
Definition: AsymmHelpers.cpp:53

arm_compute::cpu::CpuDepthwiseConvolutionAssemblyDispatch::CpuDepthwiseConvolutionAssemblyDispatch
CpuDepthwiseConvolutionAssemblyDispatch()

arm_compute::PadStrideInfo::pad_bottom
unsigned int pad_bottom() const
Get the bottom padding.
Definition: Types.h:739

arm_compute::DataType
DataType
Available data types.
Definition: Types.h:77

arm_compute::PadStrideInfo::pad_left
unsigned int pad_left() const
Get the left padding.
Definition: Types.h:724

arm_compute::DataLayout
DataLayout
[DataLayout enum definition]
Definition: Types.h:114

arm_compute::is_data_type_float
bool is_data_type_float(DataType dt)
Check if a given data type is of floating point type.
Definition: Utils.h:947

arm_compute::calculate_same_pad
PadStrideInfo calculate_same_pad(TensorShape input_shape, TensorShape weights_shape, PadStrideInfo conv_info, DataLayout data_layout=DataLayout::NCHW, const Size2D &dilation=Size2D(1u, 1u), const DimensionRoundingType &rounding_type=DimensionRoundingType::FLOOR)
Calculate padding requirements in case of SAME padding.
Definition: Utils.cpp:333

arm_compute::Scheduler::get
static IScheduler & get()
Access the scheduler singleton.
Definition: Scheduler.cpp:94