ComputeLibrary/v21.02/_c_l_direct_convolution_layer_kernel_8cpp_source.xhtml

 /*
  * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to
  * deal in the Software without restriction, including without limitation the
  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
  * sell copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
  * The above copyright notice and this permission notice shall be included in all
  * copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
 #include "src/core/CL/kernels/CLDirectConvolutionLayerKernel.h"

 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/ITensor.h"
 #include "arm_compute/core/PixelValue.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/quantization/AsymmHelpers.h"
 #include "src/core/AccessWindowStatic.h"
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "support/StringSupport.h"

 namespace arm_compute
 {
 namespace
 {
 /** Check that the tensor descriptors and convolution settings are supported by the direct convolution kernel.
  *
  * @param[in] input     Source tensor info. Data types accepted: QASYMM8_SIGNED/QASYMM8/F16/F32. Must not be null.
  * @param[in] weights   Weights tensor info. Must have the same data type as @p input, equal width and height,
  *                      at most 4 dimensions and a channel dimension equal to the one of @p input. Must not be null.
  * @param[in] biases    Optional biases tensor info (may be nullptr). When present it must be one dimensional with
  *                      as many elements as the 4th dimension of @p weights; S32 for asymmetric quantized inputs,
  *                      otherwise the same data type as @p weights.
  * @param[in] output    Destination tensor info. Shape and data type are only checked when it is already configured.
  *                      Must not be null.
  * @param[in] conv_info Padding and stride information.
  *
  * @return An empty Status when every check passes, otherwise the Status of the first failing check.
  */
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info)
 {
     // Report null descriptors as an error instead of dereferencing them below (biases is allowed to be null)
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);

     const DataLayout data_layout = input->data_layout();
     const int        width_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
     const int        height_idx  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
     const int        channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);

     ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != weights->dimension(height_idx), "Weights should have same width and height");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != input->dimension(channel_idx),
                                     "Weights feature map dimension should match the respective input's one");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4, "Weights can be at most 4 dimensional");
     // Only the stride along X is limited here; the limit depends on the kernel size
     ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 1) && std::get<0>(conv_info.stride()) > 3, "Strides larger than 3 not supported for 1x1 convolution.");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 3 || weights->dimension(width_idx) == 5 || weights->dimension(width_idx) == 9)
                                     && std::get<0>(conv_info.stride()) > 2,
                                     "Strides larger than 2 not supported for 3x3, 5x5, 9x9 convolution.");

     // The kernel-size restriction only applies to the NCHW data layout
     if(data_layout == DataLayout::NCHW)
     {
         if(is_data_type_quantized(input->data_type()))
         {
             ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != 1 && weights->dimension(width_idx) != 3 && weights->dimension(width_idx) != 5 && weights->dimension(width_idx) != 9,
                                             "Kernel sizes other than 1x1, 3x3, 5x5 or 9x9 are not supported with quantized data types");
         }
         else
         {
             ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != 1 && weights->dimension(width_idx) != 3 && weights->dimension(width_idx) != 5,
                                             "Kernel sizes other than 1x1, 3x3 or 5x5 are not supported with float data types");
         }
     }

     if(biases != nullptr)
     {
         if(is_data_type_quantized_asymmetric(input->data_type()))
         {
             ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
         }
         else
         {
             ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
         }
         // The 4th weights dimension is compared here, not the channel dimension that is matched against the input above
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->dimension(0) != weights->dimension(3),
                                         "Biases size and number of output feature maps should match");
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->num_dimensions() > 1,
                                         "Biases should be one dimensional");
     }

     // Checks performed when output is configured
     if(output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(),
                                                            misc::shape_calculator::compute_deep_convolution_shape(*input, *weights, conv_info));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     }

     const auto data_type = input->data_type();
     if(is_data_type_quantized(data_type))
     {
         // NOTE(review): the output quantization info is read even when output is not configured yet - confirm its scale is meaningful in that case
         const UniformQuantizationInfo iqinfo = input->quantization_info().uniform();
         const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform();
         const UniformQuantizationInfo oqinfo = output->quantization_info().uniform();

         // Make sure the requantization multiplier can be represented; the computed values themselves are discarded
         float multiplier        = iqinfo.scale * wqinfo.scale / oqinfo.scale;
         int   output_multiplier = 0;
         int   output_shift      = 0;
         ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift));
     }
     return Status{};
 }

 /** Tell whether the dedicated F32 NCHW kernel variant can be selected.
  *
  * All of the following must hold: the GPU target is one of G71/G72/G76/G51/G51BIG/G51LIT/G52/G52LIT,
  * the kernel size is at most 5, both strides are 1, the data type is F32 and the layout is NCHW.
  *
  * @param[in] gpu_target    GPU target the kernel is configured for.
  * @param[in] conv_stride_x Convolution stride along X.
  * @param[in] conv_stride_y Convolution stride along Y.
  * @param[in] kernel_size   Size of the (square) convolution kernel.
  * @param[in] data_type     Data type of the input tensor.
  * @param[in] data_layout   Data layout of the input tensor.
  *
  * @return True if every condition above is satisfied.
  */
 inline bool can_run_optimized_kernel_for_bifrost_nchw(GPUTarget gpu_target, unsigned int conv_stride_x, unsigned int conv_stride_y, unsigned int kernel_size,
                                                       DataType data_type, DataLayout data_layout)
 {
     const bool is_supported_target = gpu_target_is_in(gpu_target,
                                                       GPUTarget::G71, GPUTarget::G72, GPUTarget::G76,
                                                       GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT,
                                                       GPUTarget::G52, GPUTarget::G52LIT);
     if(!is_supported_target)
     {
         return false;
     }

     if(kernel_size > 5)
     {
         return false;
     }

     if(conv_stride_x != 1 || conv_stride_y != 1)
     {
         return false;
     }

     return data_type == DataType::F32 && data_layout == DataLayout::NCHW;
 }

 inline void setup_num_elems_nchw(unsigned int &num_elems_read_per_iteration_x, unsigned int &num_elems_read_per_iteration_y,
                                  unsigned int &num_elems_written_per_iteration_x, unsigned int &num_elems_written_per_iteration_y,
                                  unsigned int kernel_size, const PadStrideInfo &conv_info, const GPUTarget target, ITensorInfo *input)
 {
     const DataType   data_type     = input->data_type();
     const DataLayout data_layout   = input->data_layout();
     unsigned int     conv_stride_x = std::get<0>(conv_info.stride());
     unsigned int     conv_stride_y = std::get<1>(conv_info.stride());

     const bool run_optimized_bifrost = can_run_optimized_kernel_for_bifrost_nchw(target, conv_stride_x, conv_stride_y, kernel_size, data_type, data_layout);

     if(run_optimized_bifrost)
     {
         // Configure kernel window
         switch(kernel_size)
         {
             case 1:
             {
                 num_elems_read_per_iteration_x    = 4;
                 num_elems_read_per_iteration_y    = 4;
                 num_elems_written_per_iteration_x = 4;
                 num_elems_written_per_iteration_y = 4;
                 break;
             }
             case 3:
             {
                 num_elems_read_per_iteration_x    = 6;
                 num_elems_read_per_iteration_y    = 5;
                 num_elems_written_per_iteration_x = 4;
                 num_elems_written_per_iteration_y = 3;
                 break;
             }
             case 5:
             {
                 num_elems_read_per_iteration_x    = 8;
                 num_elems_read_per_iteration_y    = 6;
                 num_elems_written_per_iteration_x = 4;
                 num_elems_written_per_iteration_y = 2;
                 break;
             }
             default:
             {
                 ARM_COMPUTE_ERROR("Kernel size not optimized for Bifrost");
             }
         }
     }
     else
     {
         num_elems_read_per_iteration_y    = kernel_size;
         num_elems_written_per_iteration_x = 8;
         num_elems_written_per_iteration_y = 1;
         switch(kernel_size)
         {
             case 1:
                 switch(conv_stride_x)
                 {
                     case 1:
                         num_elems_read_per_iteration_x = 8;
                         break;
                     case 2:
                         num_elems_read_per_iteration_x = 16;
                         break;
                     case 3:
                         switch(input->element_size())
                         {
                             case 1:
                                 num_elems_read_per_iteration_x = 28;
                                 break;
                             case 2:
                                 num_elems_read_per_iteration_x = 24;
                                 break;
                             case 4:
                                 num_elems_read_per_iteration_x = 22;
                                 break;
                             default:
                                 ARM_COMPUTE_ERROR("Invalid data size");
                         }
                         break;
                     default:
                         ARM_COMPUTE_ERROR("Invalid convolution stride X");
                 }
                 break;
             case 3:
                 switch(conv_stride_x)
                 {
                     case 1:
                         num_elems_read_per_iteration_x = 10;
                         break;
                     case 2:
                         num_elems_read_per_iteration_x = 17;
                         break;
                     default:
                         ARM_COMPUTE_ERROR("Invalid convolution stride X");
                 }
                 break;
             case 5:
                 switch(conv_stride_x)
                 {
                     case 1:
                         num_elems_read_per_iteration_x = 12;
                         break;
                     case 2:
                         num_elems_read_per_iteration_x = 20;
                         break;
                     default:
                         ARM_COMPUTE_ERROR("Invalid convolution stride X");
                 }
                 break;
             case 9:
                 switch(conv_stride_x)
                 {
                     case 1:
                         num_elems_read_per_iteration_x = 16;
                         break;
                     case 2:
                         num_elems_read_per_iteration_x = 24;
                         break;
                     default:
                         ARM_COMPUTE_ERROR("Invalid convolution stride X");
                 }
                 break;
             default:
                 ARM_COMPUTE_ERROR("Invalid direct convolution size");
         }
     }
 }

 /** Auto-initialise the output info if needed and compute the execution window of the kernel.
  *
  * For NHWC the window is the maximum output window stepped by min(output dimension 0, 4) along X and 1 along Y.
  * For NCHW the window is derived from the per-iteration element counts chosen by setup_num_elems_nchw() and the
  * input, weights and output access windows are used to update the window and the tensors' padding.
  *
  * @param[in,out] input     Input tensor info.
  * @param[in,out] weights   Weights tensor info.
  * @param[in,out] output    Output tensor info; initialised from the computed output shape when empty.
  * @param[in]     conv_info Padding and stride information.
  * @param[in]     target    GPU target the kernel is configured for.
  *
  * @return A pair with the Status ("Insufficient Padding!" when the NCHW window had to be changed) and the window.
  */
 std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *output, const PadStrideInfo &conv_info, const GPUTarget target)
 {
     const DataLayout data_layout = input->data_layout();

     // Get output shape
     TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*input, *weights, conv_info);

     // Output auto initialization if not yet initialized
     auto_init_if_empty(*output, output_shape,
                        1,
                        input->data_type(),
                        input->quantization_info());

     if(data_layout == DataLayout::NHWC)
     {
         const unsigned int vec_size = std::min(static_cast<unsigned int>(output->tensor_shape()[0]), 4u);

         // Create window and update padding
         Window win = calculate_max_window(*output, Steps(vec_size, 1U));
         output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
         Status err = Status{};
         return std::make_pair(err, win);
     }
     else if(data_layout == DataLayout::NCHW)
     {
         const int          width_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
         const unsigned int kernel_size = weights->dimension(width_idx);

         unsigned int num_elems_read_per_iteration_x    = 0;
         unsigned int num_elems_read_per_iteration_y    = 0;
         unsigned int num_elems_written_per_iteration_x = 0;
         unsigned int num_elems_written_per_iteration_y = 0;

         // Keep the paddings signed: they are negated below to express the origin of the input access window,
         // and negating an unsigned value would rely on wrap-around followed by a narrowing conversion.
         const int          conv_pad_left = static_cast<int>(conv_info.pad_left());
         const int          conv_pad_top  = static_cast<int>(conv_info.pad_top());
         const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
         const unsigned int conv_stride_y = std::get<1>(conv_info.stride());

         setup_num_elems_nchw(num_elems_read_per_iteration_x, num_elems_read_per_iteration_y,
                              num_elems_written_per_iteration_x, num_elems_written_per_iteration_y,
                              kernel_size, conv_info, target, input);

         // Create window and update padding
         Window win = calculate_max_window(*output, Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y));

         AccessWindowRectangle input_access(input, -conv_pad_left, -conv_pad_top, num_elems_read_per_iteration_x, num_elems_read_per_iteration_y, conv_stride_x, conv_stride_y);
         AccessWindowStatic    weights_access(weights, 0, 0, kernel_size, kernel_size);
         AccessWindowRectangle output_access(output, 0, 0, num_elems_written_per_iteration_x, num_elems_written_per_iteration_y);
         const bool            window_changed = update_window_and_padding(win, input_access, weights_access, output_access);
         output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
         Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
         return std::make_pair(err, win);
     }
     else
     {
         ARM_COMPUTE_ERROR("Not supported");
     }
 }
 } // namespace

 /** Default constructor: no tensors attached, unknown data layout, empty border and zero strides. */
 CLDirectConvolutionLayerKernel::CLDirectConvolutionLayerKernel()
     : _input(nullptr),
       _biases(nullptr),
       _weights(nullptr),
       _output(nullptr),
       _data_layout(DataLayout::UNKNOWN),
       _border_size(0),
       _conv_stride_x(0),
       _conv_stride_y(0),
       _conv_info()
 {
 }

 /** Return the border size stored by configure().
  *
  * configure() sets it to an empty border for NHWC and to the input tensor's padding otherwise.
  *
  * @return The stored border size.
  */
 BorderSize CLDirectConvolutionLayerKernel::border_size() const
 {
     return _border_size;
 }

 void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
 {
     configure(CLKernelLibrary::get().get_compile_context(), input, weights, biases, output, conv_info);
 }

 void CLDirectConvolutionLayerKernel::configure(const CLCompileContext &compile_context, const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output,
                                                const PadStrideInfo &conv_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);

     // Perform validation
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(),
                                                   weights->info(),
                                                   (biases != nullptr) ? biases->info() : nullptr,
                                                   output->info(),
                                                   conv_info));

     _conv_stride_x = std::get<0>(conv_info.stride());
     _conv_stride_y = std::get<1>(conv_info.stride());
     _data_layout   = input->info()->data_layout();
     _input         = input;
     _weights       = weights;
     _output        = output;
     _biases        = biases;
     _conv_info     = conv_info;

     const unsigned int width_idx   = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
     const unsigned int height_idx  = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
     const unsigned int channel_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
     const unsigned int kernel_size = weights->info()->dimension(width_idx);
     const DataType     data_type   = input->info()->data_type();

     const GPUTarget gpu_target = get_target();

     // Configure kernel window
     auto win_config = validate_and_configure_window(input->info(), weights->info(), output->info(), conv_info, gpu_target);
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     ICLKernel::configure_internal(win_config.second);

     std::stringstream kernel_name;
     CLBuildOptions    build_options;

     if(_data_layout == DataLayout::NHWC)
     {
         _border_size = BorderSize();

         kernel_name << "direct_convolution_nhwc";

         const unsigned int n0               = win_config.second.x().step();
         const unsigned int m0               = win_config.second.y().step();
         const unsigned int k0               = adjust_vec_size(16u, _input->info()->dimension(channel_idx));
         const unsigned int partial_store_n0 = _output->info()->dimension(channel_idx) % n0;
         const unsigned int partial_store_m0 = (_output->info()->dimension(width_idx) * _output->info()->dimension(height_idx)) % m0;
         const unsigned int pad_left         = conv_info.pad_left();
         const unsigned int pad_top          = conv_info.pad_top();

         if(_biases != nullptr)
         {
             build_options.add_option(std::string("-DHAS_BIAS"));
             build_options.add_option(std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(_biases->info()->data_type())));
         }
         build_options.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(_input->info()->dimension(width_idx)));
         build_options.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(_input->info()->dimension(height_idx)));
         build_options.add_option("-DSRC_CHANNELS=" + support::cpp11::to_string(_input->info()->dimension(channel_idx)));
         build_options.add_option("-DSRC_DATA_TYPE=" + get_cl_type_from_data_type(_input->info()->data_type()));
         build_options.add_option("-DDST_WIDTH=" + support::cpp11::to_string(_output->info()->dimension(width_idx)));
         build_options.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(_output->info()->dimension(height_idx)));
         build_options.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(_output->info()->dimension(channel_idx)));
         build_options.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(_output->info()->data_type()));
         build_options.add_option("-DWEI_WIDTH=" + support::cpp11::to_string(_weights->info()->dimension(width_idx)));
         build_options.add_option("-DWEI_HEIGHT=" + support::cpp11::to_string(_weights->info()->dimension(height_idx)));
         build_options.add_option("-DWEI_DATA_TYPE=" + get_cl_type_from_data_type(_weights->info()->data_type()));
         build_options.add_option("-DSTRIDE_X=" + support::cpp11::to_string(_conv_stride_x));
         build_options.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(_conv_stride_y));
         build_options.add_option("-DPAD_LEFT=" + support::cpp11::to_string(pad_left));
         build_options.add_option("-DPAD_TOP=" + support::cpp11::to_string(pad_top));
         build_options.add_option("-DN0=" + support::cpp11::to_string(n0));
         build_options.add_option("-DM0=" + support::cpp11::to_string(m0));
         build_options.add_option("-DK0=" + support::cpp11::to_string(k0));
         build_options.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
         build_options.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));

         if(is_data_type_quantized(data_type))
         {
             const UniformQuantizationInfo iqinfo = _input->info()->quantization_info().uniform();
             const UniformQuantizationInfo wqinfo = _weights->info()->quantization_info().uniform();
             const UniformQuantizationInfo oqinfo = _output->info()->quantization_info().uniform();

             PixelValue zero_value = PixelValue(0, input->info()->data_type(), input->info()->quantization_info());
             int        zero_value_s32;
             zero_value.get(zero_value_s32);

             float multiplier        = iqinfo.scale * wqinfo.scale / oqinfo.scale;
             int   output_multiplier = 0;
             int   output_shift      = 0;
             quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
             build_options.add_option("-DIS_QUANTIZED");
             build_options.add_option("-DDST_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
             build_options.add_option("-DDST_SHIFT=" + support::cpp11::to_string(output_shift));
             build_options.add_option("-DSRC_OFFSET=" + support::cpp11::to_string(-iqinfo.offset));
             build_options.add_option("-DWEI_OFFSET=" + support::cpp11::to_string(-wqinfo.offset));
             build_options.add_option("-DDST_OFFSET=" + support::cpp11::to_string(oqinfo.offset));
             build_options.add_option("-DZERO_VALUE=" + support::cpp11::to_string(zero_value_s32));
             build_options.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(DataType::S32));
         }
         else
         {
             build_options.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(data_type));
             build_options.add_option("-DSRC_OFFSET=" + support::cpp11::to_string(0));
             build_options.add_option("-DWEI_OFFSET=" + support::cpp11::to_string(0));
             build_options.add_option("-DDST_OFFSET=" + support::cpp11::to_string(0));
         }
     }
     else
     {
         _border_size = BorderSize(_input->info()->padding());

         kernel_name << "direct_convolution" << kernel_size << "x" << kernel_size;

         build_options.add_option_if(_biases != nullptr, std::string("-DHAS_BIAS"));

         const bool run_optimized_for_bifrost = can_run_optimized_kernel_for_bifrost_nchw(gpu_target, _conv_stride_x, _conv_stride_y, kernel_size, data_type, _data_layout);

         if(run_optimized_for_bifrost)
         {
             build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(_weights->info()->dimension(channel_idx))));

             kernel_name << "_f32_bifrost";
         }
         else
         {
             build_options.add_option(std::string("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)));
             build_options.add_option(std::string("-DDATA_SIZE=" + get_data_size_from_data_type(data_type)));
             build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(_weights->info()->dimension(channel_idx))));
             build_options.add_option(std::string("-DSTRIDE_X=" + support::cpp11::to_string(_conv_stride_x)));
             build_options.add_option(std::string("-DDATA_TYPE_PROMOTED=" + get_cl_type_from_data_type(data_type)));

             if(is_data_type_quantized(data_type))
             {
                 const UniformQuantizationInfo iqinfo = _input->info()->quantization_info().uniform();
                 const UniformQuantizationInfo wqinfo = _weights->info()->quantization_info().uniform();
                 const UniformQuantizationInfo oqinfo = _output->info()->quantization_info().uniform();

                 float multiplier        = iqinfo.scale * wqinfo.scale / oqinfo.scale;
                 int   output_multiplier = 0;
                 int   output_shift      = 0;
                 quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift);
                 build_options.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier));
                 build_options.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift));
                 build_options.add_option("-DKERNEL_SIZE=" + support::cpp11::to_string(kernel_size));
                 build_options.add_option("-DINPUT_OFFSET=" + support::cpp11::to_string(-iqinfo.offset));
                 build_options.add_option("-DWEIGHTS_OFFSET=" + support::cpp11::to_string(-wqinfo.offset));
                 build_options.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(oqinfo.offset));

                 kernel_name.str("direct_convolution_quantized");
             }
         }
     }

     _kernel = create_kernel(compile_context, kernel_name.str(), build_options.options());

     // Set config_id for enabling LWS tuning
     _config_id = kernel_name.str();
     _config_id += "_";
     _config_id += lower_string(string_from_data_type(data_type));
     _config_id += "_";
     _config_id += support::cpp11::to_string(kernel_size);
     _config_id += "_";
     _config_id += support::cpp11::to_string(border_size().left);
     _config_id += "_";
     _config_id += support::cpp11::to_string(border_size().top);
     _config_id += "_";
     _config_id += support::cpp11::to_string(border_size().right);
     _config_id += "_";
     _config_id += support::cpp11::to_string(border_size().bottom);
     _config_id += "_";
     _config_id += support::cpp11::to_string(_conv_stride_x);
     _config_id += "_";
     _config_id += support::cpp11::to_string(_conv_stride_y);
     _config_id += "_";
     _config_id += support::cpp11::to_string(output->info()->dimension(width_idx));
     _config_id += "_";
     _config_id += support::cpp11::to_string(output->info()->dimension(height_idx));
     _config_id += "_";
     _config_id += lower_string(string_from_data_layout(_data_layout));
 }

 /** Static check of whether the given tensor infos and convolution settings form a valid kernel configuration.
  *
  * @param[in] input     Source tensor info.
  * @param[in] weights   Weights tensor info.
  * @param[in] biases    Biases tensor info. May be nullptr.
  * @param[in] output    Destination tensor info.
  * @param[in] conv_info Padding and stride information.
  * @param[in] target    GPU target the kernel would be configured for.
  *
  * @return An empty Status on success, otherwise the Status of the first failing step.
  */
 Status CLDirectConvolutionLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
                                                 const GPUTarget target)
 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info));

     // The window configuration step takes mutable tensor infos, so it is run on clones of the caller's ones
     const auto input_clone   = input->clone();
     const auto weights_clone = weights->clone();
     const auto output_clone  = output->clone();

     const Status window_status = validate_and_configure_window(input_clone.get(), weights_clone.get(), output_clone.get(), conv_info, target).first;
     ARM_COMPUTE_RETURN_ON_ERROR(window_status);

     return Status{};
 }

 /** Enqueue the configured OpenCL kernel on the given command queue.
  *
  * NHWC: a single enqueue over a slice whose Y dimension spans output dimension 1 times output dimension 2 and
  * whose Z dimension spans output dimension 3.
  * Other layouts: the weights, optional biases and weights stride arguments are set once, then the kernel is
  * enqueued for every 3D slice of @p window with the matching input slice.
  *
  * @param[in]     window Region to execute; must be a valid sub-window of the configured kernel window.
  * @param[in,out] queue  Command queue the kernel is enqueued on.
  */
 void CLDirectConvolutionLayerKernel::run(const Window &window, cl::CommandQueue &queue)
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);

     // Get initial windows
     Window slice = window.first_slice_window_3D();

     if(_data_layout == DataLayout::NHWC)
     {
         slice.set(Window::DimY, Window::Dimension(0, _output->info()->dimension(1) * _output->info()->dimension(2), 1));
         slice.set(Window::DimZ, Window::Dimension(0, _output->info()->dimension(3), 1));

         // Argument order: input, output, weights, optional biases, weights stride of dimension 3
         unsigned int idx = 0;
         add_3D_tensor_argument(idx, _input, slice);
         add_3D_tensor_argument(idx, _output, slice);
         add_3D_tensor_argument(idx, _weights, slice);
         if(_biases != nullptr)
         {
             add_1D_tensor_argument(idx, _biases, slice);
         }
         _kernel.setArg(idx++, static_cast<unsigned int>(_weights->info()->strides_in_bytes()[3]));
         enqueue(queue, *this, slice, lws_hint());
     }
     else
     {
         Window win_in = window;

         // Shift the input window start back by the padding. The paddings are unsigned, so they are converted
         // to int before being negated instead of relying on unsigned wrap-around.
         win_in.adjust(Window::DimX, -static_cast<int>(_conv_info.pad_left()), true);
         win_in.adjust(Window::DimY, -static_cast<int>(_conv_info.pad_top()), true);

         const int width_idx  = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
         const int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);

         // The input window advances by the output step multiplied by the convolution stride
         win_in.set_dimension_step(width_idx, window[width_idx].step() * _conv_stride_x);
         win_in.set_dimension_step(height_idx, window[height_idx].step() * _conv_stride_y);

         // Slice-independent arguments start after the slots reserved for the input and output tensors
         Window       slice_in = win_in.first_slice_window_3D();
         unsigned int idx1     = 2 * num_arguments_per_3D_tensor();
         add_3D_tensor_argument(idx1, _weights, slice);

         if(_biases != nullptr)
         {
             Window slice_biases;
             slice_biases.use_tensor_dimensions(_biases->info()->tensor_shape());
             add_1D_tensor_argument(idx1, _biases, slice_biases);
         }

         _kernel.setArg(idx1++, static_cast<unsigned int>(_weights->info()->strides_in_bytes()[3]));

         do
         {
             // Input and output arguments are refreshed for every slice
             unsigned int idx = 0;
             add_3D_tensor_argument(idx, _input, slice_in);
             add_3D_tensor_argument(idx, _output, slice);
             enqueue(queue, *this, slice, lws_hint());
         }
         while(window.slide_window_slice_3D(slice) && win_in.slide_window_slice_3D(slice_in));
     }
 }
 } // namespace arm_compute
arm_compute::is_data_type_quantized
bool is_data_type_quantized(DataType dt)
Check if a given data type is of quantized type.
Definition: Utils.h:1168

PixelValue.h

conv_pad_top
const size_t conv_pad_top
Definition: NEDepthwiseConvolutionLayerNativeKernel.cpp:67

arm_compute::PixelValue
Class describing the value of a pixel for any image format.
Definition: PixelValue.h:34

arm_compute::calculate_max_window
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
Definition: WindowHelpers.cpp:28

ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED
#define ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(tensor)
Definition: CLValidate.h:35

WindowHelpers.h

arm_compute::IKernel::window
const Window & window() const
The maximum window the kernel can be executed on.
Definition: IKernel.cpp:28

arm_compute::GPUTarget::G76

arm_compute::CLDirectConvolutionLayerKernel::_weights
const ICLTensor * _weights
Definition: CLDirectConvolutionLayerKernel.h:117

ITensor.h

arm_compute::enqueue
void enqueue(IGCKernel &kernel, const Window &window, const gles::NDRange &lws=gles::NDRange(1U, 1U, 1U))
Add the kernel to the command queue with the given window.
Definition: IGCKernel.cpp:41

arm_compute::ITensorInfo::dimension
virtual size_t dimension(size_t index) const =0
Return the size of the requested dimension.

arm_compute::BorderSize
Container for 2D border size.
Definition: Types.h:273

arm_compute::CLBuildOptions::options
const StringSet & options() const
Gets the current options list set.
Definition: CLCompileContext.cpp:70

arm_compute::misc::shape_calculator::compute_deep_convolution_shape
TensorShape compute_deep_convolution_shape(const ITensorInfo &input, const ITensorInfo &weights, PadStrideInfo conv_info)
Calculate the deep convolution shape output shape of a tensor.
Definition: ShapeCalculator.h:738

ARM_COMPUTE_ERROR
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
Definition: Error.h:352

arm_compute::ICLKernel::lws_hint
cl::NDRange lws_hint() const
Return the Local-Workgroup-Size hint.
Definition: ICLKernel.h:276

arm_compute::test::validation::conv_info
conv_info
Definition: Winograd.cpp:599

arm_compute::PixelValue::get
void get(uint8_t &v) const
Interpret the pixel value as a U8.
Definition: PixelValue.h:241

ARM_COMPUTE_RETURN_ON_ERROR
#define ARM_COMPUTE_RETURN_ON_ERROR(status)
Checks if a status contains an error and returns it.
Definition: Error.h:204

arm_compute::support::cpp11::to_string
std::string to_string(T &&value)
Convert integer and float values to string.
Definition: StringSupport.h:162

arm_compute::GPUTarget::G52LIT

arm_compute::ITensorInfo::data_type
virtual DataType data_type() const =0
Data type used for each element of the tensor.

arm_compute::GPUTarget::G71

arm_compute::Format::F32
1 channel, 1 F32 per channel

arm_compute::DataLayoutDimension::HEIGHT
height

arm_compute::CLDirectConvolutionLayerKernel::_output
ICLTensor * _output
Definition: CLDirectConvolutionLayerKernel.h:118

arm_compute::test::validation::data_layout
const DataLayout data_layout
Definition: Im2Col.cpp:151

arm_compute::CLKernelLibrary::get
static CLKernelLibrary & get()
Access the KernelLibrary singleton.
Definition: CLKernelLibrary.cpp:1119

arm_compute::ITensorInfo
Store the tensor's metadata.
Definition: ITensorInfo.h:40

ARM_COMPUTE_ERROR_THROW_ON
#define ARM_COMPUTE_ERROR_THROW_ON(status)
Definition: Error.h:455

arm_compute::UniformQuantizationInfo
Quantization info when assuming per layer quantization.
Definition: QuantizationInfo.h:43

arm_compute::Window::Dimension
Describe one of the image's dimensions with a start, end and step.
Definition: Window.h:77

arm_compute::UniformQuantizationInfo::scale
float scale
Definition: QuantizationInfo.h:65

arm_compute::PadStrideInfo::pad_top
unsigned int pad_top() const
Get the top padding.
Definition: Types.h:806

arm_compute::quantization::calculate_quantized_multiplier
Status calculate_quantized_multiplier(float multiplier, int32_t *quant_multiplier, int32_t *shift, bool ignore_epsilon=false)
Calculate quantized representation of multiplier.
Definition: AsymmHelpers.cpp:39

arm_compute::Status
Status class.
Definition: Error.h:52

arm_compute::GPUTarget::G51LIT

CLHelpers.h

arm_compute::lower_string
std::string lower_string(const std::string &val)
Lower a given string.
Definition: Utils.cpp:350

CLDirectConvolutionLayerKernel.h

ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(...)
Definition: Validate.h:288

arm_compute::CLDirectConvolutionLayerKernel::validate
static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, const GPUTarget target)
Static function to check if given info will lead to a valid configuration of CLDirectConvolutionLayer...
Definition: CLDirectConvolutionLayerKernel.cpp:516

arm_compute::ICLKernel::add_3D_tensor_argument
void add_3D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
Add the passed 3D tensor's parameters to the object's kernel's arguments starting from the index idx...
Definition: ICLKernel.h:172

arm_compute::Window::use_tensor_dimensions
void use_tensor_dimensions(const TensorShape &shape, size_t first_dimension=Window::DimX)
Use the tensor's dimensions to fill the window dimensions.
Definition: Window.inl:276

arm_compute
Copyright (c) 2017-2021 Arm Limited.
Definition: 00_introduction.dox:24

arm_compute::Format::F16
1 channel, 1 F16 per channel

arm_compute::test::validation::input
auto input
Definition: LSTMLayerQuantized.cpp:486

arm_compute::CLDirectConvolutionLayerKernel::_data_layout
DataLayout _data_layout
Definition: CLDirectConvolutionLayerKernel.h:119

arm_compute::Format::S32
1 channel, 1 S32 per channel

arm_compute::CLBuildOptions::add_option
void add_option(std::string option)
Adds option to the existing build option list.
Definition: CLCompileContext.cpp:39

Utils.h

arm_compute::test::validation::data_type
const DataType data_type
Definition: Im2Col.cpp:150

CLValidate.h

arm_compute::create_kernel
cl::Kernel create_kernel(const CLCompileContext &ctx, const std::string &kernel_name, const std::set< std::string > &build_opts=std::set< std::string >())
Creates an opencl kernel using a compile context.
Definition: CLHelpers.cpp:403

arm_compute::string_from_data_type
const std::string & string_from_data_type(DataType dt)
Convert a data type identity into a string.
Definition: Utils.cpp:135

StringSupport.h

arm_compute::get_data_size_from_data_type
std::string get_data_size_from_data_type(const DataType &dt)
Get the size of a data type in number of bits.
Definition: CLHelpers.cpp:191

arm_compute::Window::DimX
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
Definition: Window.h:43

arm_compute::update_window_and_padding
bool update_window_and_padding(Window &win, Ts &&... patterns)
Update window and padding size for each of the access patterns.
Definition: WindowHelpers.h:46

arm_compute::ICLKernel::num_arguments_per_3D_tensor
static constexpr unsigned int num_arguments_per_3D_tensor()
Returns the number of arguments enqueued per 3D tensor object.
Definition: ICLKernel.h:214

arm_compute::test::validation::output_shape
TensorShape output_shape
Definition: LSTMLayerQuantized.cpp:469

arm_compute::ITensorInfo::tensor_shape
virtual const TensorShape & tensor_shape() const =0
Size for each dimension of the tensor.

arm_compute::CLDirectConvolutionLayerKernel::_conv_info
PadStrideInfo _conv_info
Definition: CLDirectConvolutionLayerKernel.h:123

arm_compute::DataType::QASYMM8
quantized, asymmetric fixed-point 8-bit number unsigned

build_options
std::set< std::string > build_options
Definition: CLIm2ColKernel.cpp:53

arm_compute::CLDirectConvolutionLayerKernel::run
void run(const Window &window, cl::CommandQueue &queue) override
Enqueue the OpenCL kernel to process the given window on the passed OpenCL command queue...
Definition: CLDirectConvolutionLayerKernel.cpp:525

arm_compute::PadStrideInfo::stride
std::pair< unsigned int, unsigned int > stride() const
Get the stride.
Definition: Types.h:770

conv_stride_x
const size_t conv_stride_x
Definition: NEDepthwiseConvolutionLayerNativeKernel.cpp:64

kernel_name
std::string kernel_name
Definition: CLIm2ColKernel.cpp:52

arm_compute::ICLKernel::get_target
GPUTarget get_target() const
Get the targeted GPU architecture.
Definition: ICLKernel.h:336

arm_compute::QuantizationInfo::uniform
UniformQuantizationInfo uniform() const
Return per layer quantization info.
Definition: QuantizationInfo.h:149

arm_compute::get_cl_type_from_data_type
std::string get_cl_type_from_data_type(const DataType &dt)
Translates a tensor data type to the appropriate OpenCL type.
Definition: CLHelpers.cpp:37

arm_compute::auto_init_if_empty
bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, QuantizationInfo quantization_info=QuantizationInfo())
Auto initialize the tensor info (shape, number of channels and data type) if the current assignment i...
Definition: AutoConfiguration.h:42

arm_compute::CLDirectConvolutionLayerKernel::_border_size
BorderSize _border_size
Definition: CLDirectConvolutionLayerKernel.h:120

ShapeCalculator.h

arm_compute::CLDirectConvolutionLayerKernel::_conv_stride_y
int _conv_stride_y
Definition: CLDirectConvolutionLayerKernel.h:122

arm_compute::misc::ICloneable::clone
virtual std::unique_ptr< T > clone() const =0
Provide a clone of the current object of class T.

arm_compute::ITensor::info
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.

arm_compute::CLBuildOptions::add_option_if
void add_option_if(bool cond, std::string option)
Adds option if a given condition is true.
Definition: CLCompileContext.cpp:44

arm_compute::GPUTarget::G72

arm_compute::PadStrideInfo
Padding and stride information class.
Definition: Types.h:722

arm_compute::Window::set
void set(size_t dimension, const Dimension &dim)
Set the values of a given dimension.
Definition: Window.inl:49

arm_compute::ITensorInfo::padding
virtual PaddingSize padding() const =0
Padding of tensor.

arm_compute::CLBuildOptions
Build options.
Definition: CLCompileContext.h:38

arm_compute::DataLayoutDimension::CHANNEL
channel

arm_compute::Window::slide_window_slice_3D
bool slide_window_slice_3D(Window &slice) const
Slide the passed 3D window slice.
Definition: Window.h:335

arm_compute::ErrorCode::RUNTIME_ERROR
Generic runtime error.

arm_compute::ITensorInfo::quantization_info
virtual QuantizationInfo quantization_info() const =0
Get the quantization settings (scale and offset) of the tensor.

ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
Definition: Validate.h:941

arm_compute::DataLayout::NCHW
Num samples, channels, height, width.

arm_compute::CLCompileContext
CLCompileContext class.
Definition: CLCompileContext.h:202

arm_compute::is_data_type_quantized_asymmetric
bool is_data_type_quantized_asymmetric(DataType dt)
Check if a given data type is of asymmetric quantized type.
Definition: Utils.h:1190

arm_compute::CLDirectConvolutionLayerKernel::_input
const ICLTensor * _input
Definition: CLDirectConvolutionLayerKernel.h:115

arm_compute::CLDirectConvolutionLayerKernel::_conv_stride_x
int _conv_stride_x
Definition: CLDirectConvolutionLayerKernel.h:121

conv_stride_y
const size_t conv_stride_y
Definition: NEDepthwiseConvolutionLayerNativeKernel.cpp:65

arm_compute::CLDirectConvolutionLayerKernel::border_size
BorderSize border_size() const override
The size of the border for that kernel.
Definition: CLDirectConvolutionLayerKernel.cpp:324

arm_compute::Window::DimY
static constexpr size_t DimY
Alias for dimension 1 also known as Y dimension.
Definition: Window.h:45

arm_compute::Window::set_dimension_step
void set_dimension_step(size_t dimension, int step)
Set the step of a given dimension.
Definition: Window.inl:167

arm_compute::CLDirectConvolutionLayerKernel::CLDirectConvolutionLayerKernel
CLDirectConvolutionLayerKernel()
Default constructor.
Definition: CLDirectConvolutionLayerKernel.cpp:319

arm_compute::ICLTensor
Interface for OpenCL tensor.
Definition: ICLTensor.h:42

AutoConfiguration.h

arm_compute::string_from_data_layout
const std::string & string_from_data_layout(DataLayout dl)
Convert a data layout identity into a string.
Definition: Utils.cpp:123

ICLTensor.h

ARM_COMPUTE_CREATE_ERROR
#define ARM_COMPUTE_CREATE_ERROR(error_code, msg)
Creates an error with a given message.
Definition: Error.h:159

arm_compute::Window::DimZ
static constexpr size_t DimZ
Alias for dimension 2 also known as Z dimension.
Definition: Window.h:47

arm_compute::GPUTarget
GPUTarget
Available GPU Targets.
Definition: GPUTarget.h:34

arm_compute::CLDirectConvolutionLayerKernel::_biases
const ICLTensor * _biases
Definition: CLDirectConvolutionLayerKernel.h:116

arm_compute::CLVersion::UNKNOWN

arm_compute::cpu::step
constexpr int step
Definition: fp32.cpp:35

CLKernelLibrary.h
Manages all the OpenCL kernels compilation and caching, provides accessors for the OpenCL Context...

ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(...)
Definition: Validate.h:545

arm_compute::DataLayout::NHWC
Num samples, height, width, channels.

ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
Definition: Validate.h:792

arm_compute::validate_arguments
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage)
Definition: NEGEMMLowpQuantizeDownInt32ScaleKernel.cpp:45

arm_compute::UniformQuantizationInfo::offset
int32_t offset
Definition: QuantizationInfo.h:66

ARM_COMPUTE_RETURN_ERROR_ON_MSG
#define ARM_COMPUTE_RETURN_ERROR_ON_MSG(cond, msg)
If the condition is true, an error is returned.
Definition: Error.h:244

arm_compute::DataLayoutDimension::WIDTH
width

ARM_COMPUTE_ERROR_ON_NULLPTR
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
Definition: Validate.h:161

Helpers.h

AccessWindowStatic.h

arm_compute::GPUTarget::G51BIG

arm_compute::adjust_vec_size
unsigned int adjust_vec_size(unsigned int vec_size, size_t dim0)
Returns the adjusted vector size in case it is less than the input's first dimension, getting rounded down to its closest valid vector size.
Definition: Utils.h:1358

arm_compute::DataType::QASYMM8_SIGNED
quantized, asymmetric fixed-point 8-bit number signed

arm_compute::ITensorInfo::strides_in_bytes
virtual const Strides & strides_in_bytes() const =0
The strides in bytes for accessing each dimension of the tensor.

arm_compute::get_data_layout_dimension_index
size_t get_data_layout_dimension_index(const DataLayout data_layout, const DataLayoutDimension data_layout_dimension)
Get the index of the given dimension.
Definition: Helpers.inl:193

AsymmHelpers.h

arm_compute::Window::adjust
void adjust(size_t dimension, int adjust_value, bool is_at_start)
Adjust the start or end of a given dimension by the given value.
Definition: Window.inl:140

arm_compute::ICLKernel::add_1D_tensor_argument
void add_1D_tensor_argument(unsigned int &idx, const ICLTensor *tensor, const Window &window)
Add the passed 1D tensor's parameters to the object's kernel's arguments starting from the index idx...
Definition: ICLKernel.h:124

arm_compute::Window::first_slice_window_3D
Window first_slice_window_3D() const
First 3D slice of the window.
Definition: Window.h:291

arm_compute::DataType
DataType
Available data types.
Definition: Types.h:77

arm_compute::PadStrideInfo::pad_left
unsigned int pad_left() const
Get the left padding.
Definition: Types.h:796

arm_compute::DataLayout
DataLayout
[DataLayout enum definition]
Definition: Types.h:120

conv_pad_left
const size_t conv_pad_left
Definition: NEDepthwiseConvolutionLayerNativeKernel.cpp:66

arm_compute::Window
Describe a multidimensional execution window.
Definition: Window.h:39

arm_compute::GPUTarget::G52

ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)
Definition: Validate.h:205

arm_compute::gpu_target_is_in
bool gpu_target_is_in(GPUTarget target_to_check, GPUTarget target, Args... targets)
Helper function to check whether a gpu target is equal to the provided targets.
Definition: GPUTarget.h:96

arm_compute::test::validation::reference::slice
SimpleTensor< T > slice(const SimpleTensor< T > &src, Coordinates starts, Coordinates ends)
Definition: SliceOperations.cpp:38

arm_compute::ITensorInfo::data_layout
virtual DataLayout data_layout() const =0
Get the data layout of the tensor.

arm_compute::CLDirectConvolutionLayerKernel::configure
void configure(const ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info)
Set the input, weights, biases and output tensors.
Definition: CLDirectConvolutionLayerKernel.cpp:329

arm_compute::GPUTarget::G51