27 #include "depthwise.hpp"
// Depth-first depthwise-convolution driver, templated on the input,
// weight and output element types.
// NOTE(review): the enclosing class declaration itself falls outside this
// view; these members appear to belong to a driver class deriving from
// DepthwiseCommon -- confirm against the full header.
65 template <
typename TInput,
typename TWeight,
typename TOutput>
// Convenience alias for the base class, used to refer to inherited members.
69 using Parent = DepthwiseCommon<TInput, TWeight, TOutput>;
// Strategy object describing the tile geometry (output rows/cols and input
// rows/cols per tile -- see the get_output_rows()/get_output_cols()/
// get_input_rows()/get_input_cols() calls below). Owned by the driver and
// held as pointer-to-const, so it is never mutated after construction.
72 std::unique_ptr<const IDepthfirstStrategy> m_strat;
// Returns the number of bytes of scratch working space required by a single
// thread; execute_internal() below offsets into a shared buffer by
// thread_id * this value. Implemented by derived classes.
75 virtual size_t get_working_size_per_thread()
const = 0;
// Initialises one thread's slice of the working space (the pointer passed is
// already offset for the calling thread -- see execute_internal()).
// Implemented by derived classes.
78 virtual void initialise_working_space(
void *)
const = 0;
// Computes a single output tile, handling any input/output padding.
// (output_i, output_j) give the tile's top-left position in the output
// tensor; [output_channel_start, output_channel_end) bounds the channels
// processed. NOTE(review): the remainder of this parameter list (tensor
// specs, parameters, working space -- as suggested by the call sites below)
// is missing from this view; confirm against the full header.
81 virtual void compute_tile_padded(
82 const DepthwiseArgs &
args,
83 unsigned int output_i,
unsigned int output_j,
84 unsigned int output_channel_start,
unsigned int output_channel_end,
// Computes a row of n_tile_cols output tiles which require top/bottom (row)
// padding but no left/right padding. The default implementation simply
// delegates each tile to compute_tile_padded(), stepping output_j forward by
// the strategy's output-column count per tile; derived classes may override
// with a faster path. NOTE(review): part of the signature and the end of the
// body are missing from this view.
96 virtual void compute_row_padded_tile_row(
97 const DepthwiseArgs &
args,
98 const unsigned int output_i,
unsigned int output_j,
unsigned int n_tile_cols,
99 const unsigned int output_channel_start,
const unsigned int output_channel_end,
// One padded-tile computation per column tile; n_tile_cols counts down while
// output_j advances one tile-width at a time.
106 for (; n_tile_cols; n_tile_cols--, output_j += m_strat->get_output_cols())
108 this->compute_tile_padded(
110 output_i, output_j, output_channel_start, output_channel_end,
// Computes an n_tile_rows x n_tile_cols grid of output tiles which need no
// padding handling. The default implementation iterates the grid and calls
// compute_tile_padded() for each tile (correct, if not maximally fast --
// derived classes may override). Row position advances by the strategy's
// output-row count per tile row; column position by its output-column count
// per tile. NOTE(review): part of the signature and some call arguments are
// missing from this view.
121 virtual void compute_tiles_unpadded(
122 const DepthwiseArgs &
args,
123 unsigned int start_output_i,
unsigned int start_output_j,
124 unsigned int n_tile_rows,
unsigned int n_tile_cols,
125 unsigned int output_channel_start,
unsigned int output_channel_end,
132 for (
unsigned int tile_i = 0; tile_i < n_tile_rows; tile_i++)
// Each row of tiles restarts from the same starting output column.
134 unsigned int row_start_output_j = start_output_j;
135 for (
unsigned int tile_j = 0; tile_j < n_tile_cols; tile_j++)
137 this->compute_tile_padded(
139 start_output_i, row_start_output_j,
140 output_channel_start, output_channel_end,
143 row_start_output_j += m_strat->get_output_cols();
// Move down one tile row in the output tensor.
145 start_output_i += m_strat->get_output_rows();
// Entry point for one thread's share of the depthwise convolution. Splits
// the work either over output rows (the usual case) or over batches (when
// there is only a single output row), then walks the output tile grid,
// dispatching unpadded tiles to the fast path and padded tiles to
// compute_tile_padded(). NOTE(review): several parameters (input/output
// pointers, working_space, parameters) and some statements fall in gaps in
// this view; comments below describe only what is visible.
149 void execute_internal(
150 const DepthwiseArgs &
args,
154 size_t ld_input_batch,
157 size_t ld_output_col,
158 size_t ld_output_row,
159 size_t ld_output_batch,
161 unsigned int thread_id,
162 unsigned int n_threads
// Each thread gets a private slice of the shared working-space buffer,
// offset by its thread id, and initialises it before use.
166 void *thread_working_space =
167 static_cast<uint8_t *
>(working_space) + thread_id * this->get_working_size_per_thread();
168 this->initialise_working_space(thread_working_space);
// Wrap the raw output pointer with its row/column strides.
172 TensorSpec<TOutput *> output_tensor(
reinterpret_cast<TOutput *
>(output), ld_output_row, ld_output_col);
// Total output channels = input channels x depth multiplier.
174 const auto n_output_channels =
args.input_channels *
args.channel_multiplier;
// By default, parallelise across output rows. If there is only one output
// row there is nothing to split there, so parallelise across batches
// instead (exactly one of the two schemes is active).
178 auto thread_id_for_rows = thread_id;
179 auto n_threads_for_rows = n_threads;
180 auto thread_id_for_batches = 0;
181 auto n_threads_for_batches = 1;
182 if (
args.output_rows == 1) {
183 thread_id_for_rows = 0;
184 n_threads_for_rows = 1;
185 thread_id_for_batches = thread_id;
186 n_threads_for_batches = n_threads;
// Advance both tensors to this thread's first batch.
190 input_tensor.
base += ld_input_batch*thread_id_for_batches;
191 output_tensor.
base += ld_output_batch*thread_id_for_batches;
// Batch loop: strided by the number of batch-parallel threads (stride 1,
// i.e. every batch, when parallelising over rows instead).
192 for (
unsigned int batch = thread_id_for_batches;
193 batch <
args.n_batches;
194 batch += n_threads_for_batches)
// Row-tile loop: each thread starts at its own tile row and strides by
// n_threads_for_rows tile rows.
197 for (
unsigned int start_output_i = thread_id_for_rows * m_strat->get_output_rows();
198 start_output_i <
args.output_rows;
199 start_output_i += n_threads_for_rows * m_strat->get_output_rows())
// Work out whether this tile row runs off the bottom of the output, and
// whether the corresponding input window needs top/bottom padding.
203 const auto end_output_i = start_output_i + m_strat->get_output_rows();
204 const bool pad_output_bottom =
args.output_rows < end_output_i;
206 const int start_input_i = start_output_i *
args.stride_rows -
args.padding.top;
207 const bool pad_input_top = start_input_i < 0;
208 const int end_input_i = start_input_i + m_strat->get_input_rows();
209 const bool pad_input_bottom =
static_cast<int>(
args.input_rows) < end_input_i;
212 || pad_output_bottom;
// Column loop: march across the output one tile (or run of tiles) at a
// time until the whole row of tiles is covered.
217 unsigned int start_output_j = 0;
218 while (start_output_j <
args.output_cols)
220 const int start_in_j = start_output_j *
args.stride_cols -
args.padding.left;
221 const bool pad_input_left = start_in_j < 0;
// Count how many whole tiles fit before the right edge of the output;
// these can take the unpadded fast path.
224 int n_unpadded_tiles = 0;
228 n_unpadded_tiles = (
args.output_cols - start_output_j) / m_strat->get_output_cols();
// Shrink the unpadded run while it would overrun either the output
// columns or the input columns (the last tiles then fall through to the
// padded path below).
231 const int tile_stride = m_strat->get_output_cols() *
args.stride_cols;
232 int end_output_j = start_output_j + n_unpadded_tiles * m_strat->get_output_cols();
233 int end_input_j = start_in_j + m_strat->get_input_cols() + (n_unpadded_tiles - 1)*tile_stride;
235 while (n_unpadded_tiles > 0 &&
236 (
static_cast<int>(
args.output_cols) < end_output_j ||
237 static_cast<int>(
args.input_cols) < end_input_j))
240 end_output_j -= m_strat->get_output_cols();
241 end_input_j -= tile_stride;
// Dispatch the unpadded run: the fully-unpadded grid routine when no row
// padding is needed, otherwise the row-padded variant. (NOTE(review): the
// branch condition selecting between the two calls is in a gap in this
// view -- presumably it tests the row-padding flags computed above.)
246 if (n_unpadded_tiles)
251 this->compute_tiles_unpadded(
253 start_output_i, start_output_j,
255 0, n_output_channels,
256 input_tensor, output_tensor,
parameters, thread_working_space
262 this->compute_row_padded_tile_row(
264 start_output_i, start_output_j, n_unpadded_tiles,
265 0, n_output_channels,
266 input_tensor, output_tensor,
parameters, thread_working_space
269 start_output_j += n_unpadded_tiles * m_strat->get_output_cols();
// Any remaining (padded) tile is handled one at a time by the general
// padded routine.
273 this->compute_tile_padded(
275 start_output_i, start_output_j,
276 0, n_output_channels,
277 input_tensor, output_tensor,
parameters, thread_working_space
279 start_output_j += m_strat->get_output_cols();
// Step both tensors to this thread's next batch.
285 input_tensor.
base += ld_input_batch*n_threads_for_batches;
286 output_tensor.
base += ld_output_batch*n_threads_for_batches;
// Total working-space requirement = per-thread requirement x thread count.
// NOTE(review): the enclosing function's signature (presumably a
// get_working_size(n_threads) override) is outside this view -- confirm.
298 return n_threads * this->get_working_size_per_thread();