ComputeLibrary/v21.02/_n_e_direct_convolution_layer_output_stage_kernel_8cpp_source.xhtml

 /*
  * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to
  * deal in the Software without restriction, including without limitation the
  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
  * The above copyright notice and this permission notice shall be included in all
  * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
 #include "src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.h"

 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/Window.h"
 #include "arm_compute/core/utils/misc/Traits.h"
 #include "src/core/AccessWindowStatic.h"
 #include "src/core/CPP/Validate.h"
 #include "src/core/NEON/NEAsymm.h"
 #include "src/core/NEON/NEFixedPoint.h"
 #include "src/core/NEON/wrapper/wrapper.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"

 #include <arm_neon.h>
 #include <cstddef>
 #include <cstdint>

 namespace arm_compute
 {
 namespace
 {
 /** Check that the tensor infos and the kernel descriptor form a supported configuration.
  *
  * @param[in] input  Info of the accumulator tensor. Must not be null. Data types accepted: F16/S32/F32.
  * @param[in] bias   Info of the bias tensor, or nullptr when no bias is added. When given it must be
  *                   one-dimensional, have the input data type and as many elements as the input has channels.
  * @param[in] output Info of the destination tensor, or nullptr for in-place operation (rejected for S32 input).
  * @param[in] info   Kernel descriptor; its output_data_type is only checked for S32 input with an unconfigured output.
  *
  * @return An empty Status on success, otherwise the Status produced by the first failing check.
  */
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
                           const DirectConvolutionLayerOutputStageKernelInfo &info)
 {
     // Input related checks
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input);
     ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
     ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::S32, DataType::F32);

     // Bias related checks (the bias is optional)
     if(bias != nullptr)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
         ARM_COMPUTE_RETURN_ERROR_ON(bias->dimension(0) != input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)));
         ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
     }

     const bool is_s32_input = (input->data_type() == DataType::S32);

     // S32 accumulators are narrowed to 8 bit, so a separate output tensor is mandatory
     if(is_s32_input)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(output == nullptr, "In-place computation not allowed for quantized output");
     }

     const bool is_output_configured = (output != nullptr) && (output->total_size() != 0);

     if(!is_output_configured)
     {
         if(is_s32_input)
         {
             // In case of quantized computation and unconfigured output, the output data type must be provided through DirectConvolutionLayerOutputStageKernelInfo
             ARM_COMPUTE_RETURN_ERROR_ON((info.output_data_type != DataType::QASYMM8) && (info.output_data_type != DataType::QASYMM8_SIGNED));
         }
         return Status{};
     }

     // Checks performed when output is configured
     if(is_data_type_float(input->data_type()))
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     }
     else
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
     }
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);

     return Status{};
 }

 /** Floating-point output stage for NCHW tensors: adds the per-channel bias (if any) and stores the result.
  *
  * The bias value is selected with the z coordinate of the element being processed, so it is the same
  * for a whole row and is read once per row rather than once per processed vector.
  *
  * @param[in]  input                        Source tensor, read as elements of type T.
  * @param[in]  bias                         Bias tensor; only accessed when @p has_bias is true.
  * @param[in]  window                       Execution window; its X range is traversed manually inside each row.
  * @param[out] output                       Destination tensor, written as elements of type T.
  * @param[in]  result_fixedpoint_multiplier Unused in the floating-point path.
  * @param[in]  result_shift                 Unused in the floating-point path.
  * @param[in]  result_offset_after_shift    Unused in the floating-point path.
  * @param[in]  has_bias                     True if @p bias has to be added.
  */
 template <typename T>
 typename std::enable_if<arm_compute::utils::traits::is_floating_point<T>::value, void>::type
 output_stage_nchw(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
                   int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, bool has_bias)
 {
     /** Neon vector tag type. */
     using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;

     ARM_COMPUTE_ERROR_ON(input->info()->data_layout() == DataLayout::UNKNOWN);
     ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier);
     ARM_COMPUTE_UNUSED(result_shift);
     ARM_COMPUTE_UNUSED(result_offset_after_shift);

     const int window_start_x = window.x().start();
     const int window_end_x   = window.x().end();
     // Number of elements of type T held by one 16-byte vector
     const int window_step_x = 16 / input->info()->element_size();

     // X is collapsed to a single step: each lambda invocation handles a complete row
     Window win = window;
     win.set(Window::DimX, Window::Dimension(0, 1, 1));

     Iterator in(input, win);
     Iterator out(output, win);
     execute_window_loop(win, [&](const Coordinates & id)
     {
         const T *const in_row  = reinterpret_cast<const T *>(in.ptr());
         T *const       out_row = reinterpret_cast<T *>(out.ptr());

         // The bias only depends on id.z(): fetch it once for the whole row
         T row_bias{};
         if(has_bias)
         {
             row_bias = *reinterpret_cast<const T *>(bias->ptr_to_element(Coordinates(id.z())));
         }
         const auto vb = wrapper::vdup_n(row_bias, ExactTagType{});

         int x = window_start_x;
         for(; x <= (window_end_x - window_step_x); x += window_step_x)
         {
             auto v_in = wrapper::vloadq(in_row + x);

             // Accumulate bias
             if(has_bias)
             {
                 v_in = wrapper::vadd(v_in, vb);
             }

             wrapper::vstore(out_row + x, v_in);
         }

         // Left-overs loop
         for(; x < window_end_x; ++x)
         {
             auto s_in = *(in_row + x);

             // Accumulate bias
             if(has_bias)
             {
                 s_in += row_bias;
             }

             *(out_row + x) = s_in;
         }
     },
     in, out);
 }

 /** Floating-point output stage for NHWC tensors: adds the bias (if any) element-wise along X and stores the result.
  *
  * The bias is indexed with the same x offset as the input row, i.e. bias element x is added to
  * input element x of every row.
  *
  * @param[in]  input                        Source tensor, read as elements of type T.
  * @param[in]  bias                         Bias tensor; may be nullptr when @p has_bias is false.
  * @param[in]  window                       Execution window; its X range is traversed manually inside each row.
  * @param[out] output                       Destination tensor, written as elements of type T.
  * @param[in]  result_fixedpoint_multiplier Unused in the floating-point path.
  * @param[in]  result_shift                 Unused in the floating-point path.
  * @param[in]  result_offset_after_shift    Unused in the floating-point path.
  * @param[in]  has_bias                     True if @p bias has to be added.
  */
 template <typename T>
 typename std::enable_if<arm_compute::utils::traits::is_floating_point<T>::value, void>::type
 output_stage_nhwc(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
                   int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, bool has_bias)
 {
     ARM_COMPUTE_UNUSED(result_fixedpoint_multiplier);
     ARM_COMPUTE_UNUSED(result_shift);
     ARM_COMPUTE_UNUSED(result_offset_after_shift);

     // The bias iterator must not move while the other dimensions are traversed
     Window window_bias = window;
     window_bias.set(Window::DimX, Window::Dimension(0, 1, 1));
     window_bias.set(Window::DimY, Window::Dimension(0, 0, 0));
     window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0));
     window_bias.set(3, Window::Dimension(0, 0, 0));

     const int window_start_x = window.x().start();
     const int window_end_x   = window.x().end();
     // Number of elements of type T held by one 16-byte vector
     const int window_step_x = 16 / input->info()->element_size();

     // X is collapsed to a single step: each lambda invocation handles a complete row
     Window win = window;
     win.set(Window::DimX, Window::Dimension(0, 1, 1));

     Iterator in(input, win);
     // An Iterator cannot be created on a null tensor: without bias it is bound to the input and never dereferenced
     Iterator bi = has_bias ? Iterator(bias, window_bias) : Iterator(input, window_bias);
     Iterator out(output, win);

     execute_window_loop(win, [&](const Coordinates &)
     {
         const T *const in_row  = reinterpret_cast<const T *>(in.ptr());
         T *const       out_row = reinterpret_cast<T *>(out.ptr());

         int x = window_start_x;
         for(; x <= (window_end_x - window_step_x); x += window_step_x)
         {
             auto v_in = wrapper::vloadq(in_row + x);

             // Accumulate bias
             if(has_bias)
             {
                 const auto bias_ptr = reinterpret_cast<const T *>(bi.ptr()) + x;
                 v_in                = wrapper::vadd(v_in, wrapper::vloadq(bias_ptr));
             }

             wrapper::vstore(out_row + x, v_in);
         }

         // Left-overs loop
         for(; x < window_end_x; ++x)
         {
             auto s_in = *(in_row + x);

             // Accumulate bias
             if(has_bias)
             {
                 const auto bias_ptr = reinterpret_cast<const T *>(bi.ptr()) + x;
                 s_in += *bias_ptr;
             }

             *(out_row + x) = s_in;
         }
     },
     in, bi, out);
 }

 // Quantized case
 /** Quantized output stage for NCHW tensors: adds the per-channel bias (if any) to the S32 accumulators and
  *  narrows them to TOut through finalize_quantization().
  *
  * Each vector iteration loads 16 int32 accumulators and stores 16 values of type TOut, therefore the
  * X loop advances by 16 elements; the remaining elements of the row are handled one by one.
  * The bias value is selected with the z coordinate, so it is read once per row.
  *
  * @param[in]  input                        Source tensor, read as int32_t elements.
  * @param[in]  bias                         Bias tensor, read as int32_t; only accessed when @p has_bias is true.
  * @param[in]  window                       Execution window; its X range is traversed manually inside each row.
  * @param[out] output                       Destination tensor, written as elements of type TOut.
  * @param[in]  result_fixedpoint_multiplier Multiplier forwarded to finalize_quantization().
  * @param[in]  result_shift                 Shift forwarded to finalize_quantization().
  * @param[in]  result_offset_after_shift    Offset forwarded to finalize_quantization().
  * @param[in]  has_bias                     True if @p bias has to be added.
  */
 template < typename TOut, typename std::enable_if < std::is_same<TOut, uint8_t>::value || std::is_same<TOut, int8_t>::value, int >::type = 0 >
 void output_stage_nchw(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
                        int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, bool has_bias)
 {
     using VectorType = typename wrapper::traits::neon_bitvector_t<TOut, wrapper::traits::BitWidth::W128>;
     using TagType    = typename wrapper::traits::neon_bitvector_tag_t<TOut, wrapper::traits::BitWidth::W128>;

     const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift);

     const VectorType min = wrapper::vdup_n(std::numeric_limits<TOut>::lowest(), TagType{});
     const VectorType max = wrapper::vdup_n(std::numeric_limits<TOut>::max(), TagType{});

     const int window_start_x = window.x().start();
     const int window_end_x   = window.x().end();
     // One iteration consumes 4 x int32x4_t (16 accumulators) and produces one 16-lane vector of TOut.
     // Stepping by less than 16 would make the last iterations access elements past window_end_x.
     constexpr int window_step_x = 16;

     // X is collapsed to a single step: each lambda invocation handles a complete row
     Window win = window;
     win.set(Window::DimX, Window::Dimension(0, 1, 1));

     Iterator in(input, win);
     Iterator out(output, win);

     execute_window_loop(win, [&](const Coordinates & id)
     {
         const int32_t *const in_row  = reinterpret_cast<const int32_t *>(in.ptr());
         TOut *const          out_row = reinterpret_cast<TOut *>(out.ptr());

         // The bias only depends on id.z(): fetch it once for the whole row
         int32_t row_bias = 0;
         if(has_bias)
         {
             row_bias = *reinterpret_cast<const int32_t *>(bias->ptr_to_element(Coordinates(id.z())));
         }
         const auto vb = wrapper::vdup_n(row_bias, TagType{});

         int x = window_start_x;
         for(; x <= (window_end_x - window_step_x); x += window_step_x)
         {
             const int32_t *const in_ptr = in_row + x;
             int32x4x4_t          v_in =
             {
                 {
                     wrapper::vloadq(in_ptr),
                     wrapper::vloadq(in_ptr + 4),
                     wrapper::vloadq(in_ptr + 8),
                     wrapper::vloadq(in_ptr + 12)
                 }
             };

             // Accumulate bias
             if(has_bias)
             {
                 v_in.val[0] = wrapper::vadd(v_in.val[0], vb);
                 v_in.val[1] = wrapper::vadd(v_in.val[1], vb);
                 v_in.val[2] = wrapper::vadd(v_in.val[2], vb);
                 v_in.val[3] = wrapper::vadd(v_in.val[3], vb);
             }

             wrapper::vstore(out_row + x, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32,
                                                                min, max, false));
         }

         // Left-overs loop
         for(; x < window_end_x; ++x)
         {
             int32_t s_in = *(in_row + x);

             // Accumulate bias
             if(has_bias)
             {
                 s_in += row_bias;
             }

             *(out_row + x) = finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift,
                                                    std::numeric_limits<TOut>::lowest(), std::numeric_limits<TOut>::max(), false);
         }
     },
     in, out);
 }
 /** Quantized output stage for NHWC tensors: adds the bias (if any) element-wise along X to the S32 accumulators
  *  and narrows them to TOut through finalize_quantization().
  *
  * Each vector iteration loads 16 int32 accumulators and stores 16 values of type TOut, therefore the
  * X loop advances by 16 elements; the remaining elements of the row are handled one by one.
  * The bias is indexed with the same x offset as the input row.
  *
  * @param[in]  input                        Source tensor, read as int32_t elements.
  * @param[in]  bias                         Bias tensor, read as int32_t; may be nullptr when @p has_bias is false.
  * @param[in]  window                       Execution window; its X range is traversed manually inside each row.
  * @param[out] output                       Destination tensor, written as elements of type TOut.
  * @param[in]  result_fixedpoint_multiplier Multiplier forwarded to finalize_quantization().
  * @param[in]  result_shift                 Shift forwarded to finalize_quantization().
  * @param[in]  result_offset_after_shift    Offset forwarded to finalize_quantization().
  * @param[in]  has_bias                     True if @p bias has to be added.
  */
 template < typename TOut, typename std::enable_if < std::is_same<TOut, uint8_t>::value || std::is_same<TOut, int8_t>::value, int >::type = 0 >
 void output_stage_nhwc(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
                        int result_fixedpoint_multiplier, int result_shift, int result_offset_after_shift, bool has_bias)
 {
     using VectorType = typename wrapper::traits::neon_bitvector_t<TOut, wrapper::traits::BitWidth::W128>;
     using TagType    = typename wrapper::traits::neon_bitvector_tag_t<TOut, wrapper::traits::BitWidth::W128>;

     const int32x4_t result_offset_after_shift_s32 = vdupq_n_s32(result_offset_after_shift);

     const VectorType min = wrapper::vdup_n(std::numeric_limits<TOut>::lowest(), TagType{});
     const VectorType max = wrapper::vdup_n(std::numeric_limits<TOut>::max(), TagType{});

     // The bias iterator must not move while the other dimensions are traversed
     Window window_bias = window;
     window_bias.set(Window::DimX, Window::Dimension(0, 1, 1));
     window_bias.set(Window::DimY, Window::Dimension(0, 0, 0));
     window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0));
     window_bias.set(3, Window::Dimension(0, 0, 0));

     const int window_start_x = window.x().start();
     const int window_end_x   = window.x().end();
     // One iteration consumes 4 x int32x4_t (16 accumulators) and produces one 16-lane vector of TOut.
     // Stepping by less than 16 would make the last iterations access elements past window_end_x.
     constexpr int window_step_x = 16;

     // X is collapsed to a single step: each lambda invocation handles a complete row
     Window win = window;
     win.set(Window::DimX, Window::Dimension(0, 1, 1));

     Iterator in(input, win);
     // An Iterator cannot be created on a null tensor: without bias it is bound to the input and never dereferenced
     Iterator bi = has_bias ? Iterator(bias, window_bias) : Iterator(input, window_bias);
     Iterator out(output, win);

     execute_window_loop(win, [&](const Coordinates &)
     {
         const int32_t *const in_row  = reinterpret_cast<const int32_t *>(in.ptr());
         TOut *const          out_row = reinterpret_cast<TOut *>(out.ptr());

         int x = window_start_x;
         for(; x <= (window_end_x - window_step_x); x += window_step_x)
         {
             const int32_t *const in_ptr = in_row + x;
             int32x4x4_t          v_in =
             {
                 {
                     wrapper::vloadq(in_ptr),
                     wrapper::vloadq(in_ptr + 4),
                     wrapper::vloadq(in_ptr + 8),
                     wrapper::vloadq(in_ptr + 12),
                 }
             };

             // Accumulate bias: vadd returns the sum, so it has to be written back into the accumulators
             if(has_bias)
             {
                 const auto bias_ptr = reinterpret_cast<const int32_t *>(bi.ptr()) + x;

                 v_in.val[0] = wrapper::vadd(v_in.val[0], wrapper::vloadq(bias_ptr));
                 v_in.val[1] = wrapper::vadd(v_in.val[1], wrapper::vloadq(bias_ptr + 4));
                 v_in.val[2] = wrapper::vadd(v_in.val[2], wrapper::vloadq(bias_ptr + 8));
                 v_in.val[3] = wrapper::vadd(v_in.val[3], wrapper::vloadq(bias_ptr + 12));
             }

             wrapper::vstore(out_row + x, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max, false));
         }

         // Left-overs loop
         for(; x < window_end_x; ++x)
         {
             int32_t s_in = *(in_row + x);

             // Accumulate bias
             if(has_bias)
             {
                 const auto bias_ptr = reinterpret_cast<const int32_t *>(bi.ptr()) + x;
                 s_in += *bias_ptr;
             }

             *(out_row + x) = finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift,
                                                    std::numeric_limits<TOut>::lowest(), std::numeric_limits<TOut>::max(), false);
         }
     },
     in, bi, out);
 }
 } // namespace

 /** Default constructor: no function selected, no tensors bound, all quantization parameters set to zero. */
 NEDirectConvolutionLayerOutputStageKernel::NEDirectConvolutionLayerOutputStageKernel()
     : _func(nullptr),
       _input(nullptr),
       _bias(nullptr),
       _output(nullptr),
       _result_fixedpoint_multiplier(0),
       _result_shift(0),
       _result_offset_after_shift(0)
 {
 }

 /** Validate the arguments, store them and select the function that run() will execute.
  *
  * @param[in,out] input  Accumulator tensor. It is also the destination when @p output is nullptr.
  * @param[in]     bias   Bias tensor, or nullptr when no bias has to be added.
  * @param[out]    output Destination tensor, or nullptr for in-place operation. An empty output is
  *                       auto-initialized from the input: with info.output_data_type for S32 input,
  *                       with the input data type otherwise.
  * @param[in]     info   Kernel descriptor holding the quantization parameters and the output data type.
  */
 void NEDirectConvolutionLayerOutputStageKernel::configure(ITensor *input, const ITensor *bias, ITensor *output,
                                                           const DirectConvolutionLayerOutputStageKernelInfo &info)
 {
     // Perform validation step
     ARM_COMPUTE_ERROR_ON_NULLPTR(input);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias == nullptr) ? nullptr : bias->info(), (output == nullptr) ? nullptr : output->info(), info));

     _func                         = nullptr;
     _bias                         = bias;
     _input                        = input;
     _output                       = (output != nullptr) ? output : input;
     _result_fixedpoint_multiplier = info.result_fixedpoint_multiplier;
     _result_shift                 = info.result_shift;
     _result_offset_after_shift    = info.result_offset_after_shift;

     // Auto-initialize output if required
     if(output != nullptr && output->info() != nullptr)
     {
         // Work out expected output data type: S32 accumulators are quantized to the requested type,
         // floating-point data keeps its type (validate_arguments() requires matching types in that case)
         const DataType input_dt  = input->info()->data_type();
         const DataType output_dt = (input_dt == DataType::S32) ? info.output_data_type : input_dt;
         // Output tensor auto initialization if not yet initialized
         auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(output_dt));
     }

     Window      win = calculate_max_window(*input->info(), Steps());
     Coordinates coord;
     coord.set_num_dimensions(input->info()->num_dimensions());

     // The whole shape of the tensor that gets written is marked as valid
     if(output != nullptr && (output->info()->total_size() != 0))
     {
         output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
     }
     else
     {
         input->info()->set_valid_region(ValidRegion(coord, input->info()->tensor_shape()));
     }

     INEKernel::configure(win);

     // Only meaningful for S32 input, where validate_arguments() guarantees a non-null output
     const bool is_qasymm8_signed = (output != nullptr) ? is_data_type_quantized_asymmetric_signed(output->info()->data_type()) : false;

     // Set appropriate function
     if(input->info()->data_layout() == DataLayout::NCHW)
     {
         switch(input->info()->data_type())
         {
             case DataType::S32:
             {
                 if(is_qasymm8_signed)
                 {
                     _func = &output_stage_nchw<int8_t>;
                 }
                 else
                 {
                     _func = &output_stage_nchw<uint8_t>;
                 }
                 break;
             }
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
             case DataType::F16:
             {
                 _func = &output_stage_nchw<float16_t>;
                 break;
             }
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
             case DataType::F32:
             {
                 _func = &output_stage_nchw<float>;
                 break;
             }
             default:
             {
                 ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs.");
             }
         }
     }
     else
     {
         switch(input->info()->data_type())
         {
             case DataType::S32:
             {
                 if(is_qasymm8_signed)
                 {
                     _func = &output_stage_nhwc<int8_t>;
                 }
                 else
                 {
                     _func = &output_stage_nhwc<uint8_t>;
                 }
                 break;
             }
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
             case DataType::F16:
             {
                 _func = &output_stage_nhwc<float16_t>;
                 break;
             }
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
             case DataType::F32:
             {
                 _func = &output_stage_nhwc<float>;
                 break;
             }
             default:
             {
                 ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs.");
             }
         }
     }
 }

 /** Static check of a kernel configuration without creating the kernel.
  *
  * @param[in] input  Info of the accumulator tensor.
  * @param[in] bias   Info of the bias tensor, or nullptr.
  * @param[in] output Info of the destination tensor, or nullptr.
  * @param[in] info   Kernel descriptor.
  *
  * @return The Status computed by validate_arguments() for the same arguments.
  */
 Status NEDirectConvolutionLayerOutputStageKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
                                                            const DirectConvolutionLayerOutputStageKernelInfo &info)
 {
     // All the checks live in validate_arguments(): forward its outcome unchanged
     return validate_arguments(input, bias, output, info);
 }

 void NEDirectConvolutionLayerOutputStageKernel::run(const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
     ARM_COMPUTE_ERROR_ON(_func == nullptr);

     const bool has_bias = _bias != nullptr;
     (*_func)(_input, _bias, window, _output, _result_fixedpoint_multiplier, _result_shift, _result_offset_after_shift, has_bias);
 }
 } // namespace arm_compute
arm_compute::DirectConvolutionLayerOutputStageKernelInfo::result_fixedpoint_multiplier
int32_t result_fixedpoint_multiplier
Result output stage multiplier used for quantizing.
Definition: KernelDescriptors.h:122

arm_compute::ITensorInfo::num_dimensions
virtual size_t num_dimensions() const =0
The number of dimensions of the tensor (rank)

arm_compute::calculate_max_window
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
Definition: WindowHelpers.cpp:28

WindowHelpers.h

arm_compute::IKernel::window
const Window & window() const
The maximum window the kernel can be executed on.
Definition: IKernel.cpp:28

ITensor.h

ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED
#define ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(tensor)
Definition: Validate.h:108

arm_compute::DirectConvolutionLayerOutputStageKernelInfo::result_offset_after_shift
int32_t result_offset_after_shift
Result offset used for quantizing.
Definition: KernelDescriptors.h:124

arm_compute::test::validation::b
SimpleTensor< float > b
Definition: DFT.cpp:157

ARM_COMPUTE_ERROR
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
Definition: Error.h:352

arm_compute::wrapper::vloadq
uint8x16_t vloadq(const uint8_t *ptr)
Definition: load.h:58

ARM_COMPUTE_RETURN_ON_ERROR
#define ARM_COMPUTE_RETURN_ON_ERROR(status)
Checks if a status contains an error and returns it.
Definition: Error.h:204

arm_compute::ITensorInfo::data_type
virtual DataType data_type() const =0
Data type used for each element of the tensor.

arm_compute::wrapper::vadd
uint8x8_t vadd(const uint8x8_t &a, const uint8x8_t &b)
Definition: add.h:39

Window.h

arm_compute::Format::F32
1 channel, 1 F32 per channel

ARM_COMPUTE_ERROR_ON
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
Definition: Error.h:466

arm_compute::ITensorInfo
Store the tensor's metadata.
Definition: ITensorInfo.h:40

ARM_COMPUTE_ERROR_THROW_ON
#define ARM_COMPUTE_ERROR_THROW_ON(status)
Definition: Error.h:455

arm_compute::Status
Status class.
Definition: Error.h:52

NEAsymm.h

ARM_COMPUTE_RETURN_ERROR_ON
#define ARM_COMPUTE_RETURN_ERROR_ON(cond)
If the condition is true, an error is returned.
Definition: Error.h:296

type
decltype(strategy::transforms) typedef type
Definition: gemm_interleaved.hpp:227

arm_compute::ITensor
Interface for Neon tensor.
Definition: ITensor.h:36

arm_compute::support::cpp11::lowest
T lowest()
Definition: ToolchainSupport.h:247

arm_compute::NEDirectConvolutionLayerOutputStageKernel::run
void run(const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
Definition: NEDirectConvolutionLayerOutputStageKernel.cpp:505

arm_compute::NEDirectConvolutionLayerOutputStageKernel::NEDirectConvolutionLayerOutputStageKernel
NEDirectConvolutionLayerOutputStageKernel()
Default constructor.
Definition: NEDirectConvolutionLayerOutputStageKernel.cpp:380

arm_compute
Copyright (c) 2017-2021 Arm Limited.
Definition: 00_introduction.dox:24

arm_compute::ITensorInfo::set_valid_region
virtual void set_valid_region(const ValidRegion &valid_region)=0
Set the valid region of the tensor.

arm_compute::Format::F16
1 channel, 1 F16 per channel

arm_compute::test::validation::input
auto input
Definition: LSTMLayerQuantized.cpp:486

ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR
#define ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(...)
Definition: Validate.h:163

arm_compute::Format::S32
1 channel, 1 S32 per channel

arm_compute::Window::DimX
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
Definition: Window.h:43

ARM_COMPUTE_UNUSED
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
Definition: Error.h:152

arm_compute::NEDirectConvolutionLayerOutputStageKernel::configure
void configure(ITensor *input, const ITensor *bias=nullptr, ITensor *output=nullptr, const DirectConvolutionLayerOutputStageKernelInfo &info=DirectConvolutionLayerOutputStageKernelInfo())
Set the accumulate buffer and the biases of the kernel.
Definition: NEDirectConvolutionLayerOutputStageKernel.cpp:385

arm_compute::ITensorInfo::tensor_shape
virtual const TensorShape & tensor_shape() const =0
Size for each dimension of the tensor.

arm_compute::DataType::QASYMM8
quantized, asymmetric fixed-point 8-bit number unsigned

arm_compute::Steps
Class to describe a number of elements in each dimension.
Definition: Steps.h:40

arm_compute::Coordinates
Coordinates of an item.
Definition: Coordinates.h:37

arm_compute::is_data_type_quantized_asymmetric_signed
bool is_data_type_quantized_asymmetric_signed(DataType dt)
Check if a given data type is of asymmetric quantized signed type.
Definition: Utils.h:1209

arm_compute::auto_init_if_empty
bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, QuantizationInfo quantization_info=QuantizationInfo())
Auto initialize the tensor info (shape, number of channels and data type) if the current assignment i...
Definition: AutoConfiguration.h:42

arm_compute::misc::ICloneable::clone
virtual std::unique_ptr< T > clone() const =0
Provide a clone of the current object of class T.

Validate.h

arm_compute::ITensor::info
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.

arm_compute::test::validation::has_bias
const bool has_bias
Definition: Im2Col.cpp:152

Error.h

arm_compute::DataLayoutDimension::CHANNEL
channel

NEDirectConvolutionLayerOutputStageKernel.h

ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
Definition: Validate.h:941

arm_compute::DataLayout::NCHW
Num samples, channels, height, width.

arm_compute::Window::DimY
static constexpr size_t DimY
Alias for dimension 1 also known as Y dimension.
Definition: Window.h:45

arm_compute::test::validation::info
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)

arm_compute::NEDirectConvolutionLayerOutputStageKernel::validate
static Status validate(const ITensorInfo *input, const ITensorInfo *bias=nullptr, const ITensorInfo *output=nullptr, const DirectConvolutionLayerOutputStageKernelInfo &info=DirectConvolutionLayerOutputStageKernelInfo())
Static function to check if given info will lead to a valid configuration of NEDirectConvolutionLayer...
Definition: NEDirectConvolutionLayerOutputStageKernel.cpp:497

AutoConfiguration.h

arm_compute::ThreadInfo
Information about executing thread and CPU.
Definition: CPPTypes.h:235

arm_compute::ITensorInfo::total_size
virtual size_t total_size() const =0
Returns the total size of the tensor in bytes.

ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(...)
Definition: Validate.h:443

arm_compute::Window::DimZ
static constexpr size_t DimZ
Alias for dimension 2 also known as Z dimension.
Definition: Window.h:47

arm_compute::CLVersion::UNKNOWN

arm_compute::DirectConvolutionLayerOutputStageKernelInfo::output_data_type
DataType output_data_type
Output tensor data type to use if the output is not initialized.
Definition: KernelDescriptors.h:125

ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(...)
Definition: Validate.h:545

ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
Definition: Validate.h:792

arm_compute::validate_arguments
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage)
Definition: NEGEMMLowpQuantizeDownInt32ScaleKernel.cpp:45

Validate.h

arm_compute::wrapper::vstore
void vstore(uint8_t *ptr, uint8x8_t val)
Definition: store.h:39

ARM_COMPUTE_RETURN_ERROR_ON_MSG
#define ARM_COMPUTE_RETURN_ERROR_ON_MSG(cond, msg)
If the condition is true, an error is returned.
Definition: Error.h:244

ARM_COMPUTE_ERROR_ON_NULLPTR
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
Definition: Validate.h:161

Helpers.h

arm_compute::wrapper::vdup_n
uint8x8_t vdup_n(uint8_t value, traits::vector_64_tag)
Definition: dup_n.h:41

arm_compute::execute_window_loop
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_funct...
Definition: Helpers.inl:77

AccessWindowStatic.h

arm_compute::Dimensions::set_num_dimensions
void set_num_dimensions(size_t num_dimensions)
Set number of dimensions.
Definition: Dimensions.h:149

arm_compute::DataType::QASYMM8_SIGNED
quantized, asymmetric fixed-point 8-bit number signed

wrapper.h
Includes all wrapper headers at once.

arm_compute::ValidRegion
Container for valid region of a window.
Definition: Types.h:188

arm_compute::get_data_layout_dimension_index
size_t get_data_layout_dimension_index(const DataLayout data_layout, const DataLayoutDimension data_layout_dimension)
Get the index of the given dimension.
Definition: Helpers.inl:193

Types.h

arm_compute::DataType
DataType
Available data types.
Definition: Types.h:77

NEFixedPoint.h

arm_compute::Window
Describe a multidimensional execution window.
Definition: Window.h:39

arm_compute::finalize_quantization
wrapper::traits::neon_vector< T, 16 >::type finalize_quantization(int32x4x4_t &in_s32, int32x4_t result_shift_s32, typename wrapper::traits::neon_vector< T, 16 >::type min, typename wrapper::traits::neon_vector< T, 16 >::type max)
Definition: NEGEMMLowpQuantizeDownInt32ScaleKernel.cpp:106

arm_compute::is_data_type_float
bool is_data_type_float(DataType dt)
Check if a given data type is of floating point type.
Definition: Utils.h:1148

arm_compute::DirectConvolutionLayerOutputStageKernelInfo
Descriptor used by the direct convolution layer output stage kernels.
Definition: KernelDescriptors.h:120

ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)
Definition: Validate.h:205

arm_compute::ITensorInfo::data_layout
virtual DataLayout data_layout() const =0
Get the data layout of the tensor.

arm_compute::DirectConvolutionLayerOutputStageKernelInfo::result_shift
int32_t result_shift
Result output stage shift used for quantizing.
Definition: KernelDescriptors.h:123

Traits.h