Compute Library
 22.11
PoolingDepthfirstGenericQuantized< strategy > Class Template Reference

#include <pooling_depthfirst_generic_quantized.hpp>

Collaboration diagram for PoolingDepthfirstGenericQuantized< strategy >:
[legend]

Public Member Functions

 PoolingDepthfirstGenericQuantized (const PoolingArgs &args, const Requantize32 &rq)
 
 PoolingDepthfirstGenericQuantized (PoolingDepthfirstGenericQuantized &)=delete
 
PoolingDepthfirstGenericQuantized & operator= (PoolingDepthfirstGenericQuantized &)=delete
 
size_t sizeof_input_pointer_array (void) const
 
size_t get_working_size (unsigned int num_threads) const override
 
void execute (const void *const input, void *const output, void *const working_space, unsigned int thread_id, unsigned int num_threads) const override
 
void execute (const void *const input, size_t ld_input_col, size_t ld_input_row, size_t ld_input_batch, void *const output, size_t ld_output_col, size_t ld_output_row, size_t ld_output_batch, void *const working_space, unsigned int thread_id, unsigned int num_threads) const override
 
void execute (unsigned int batches, unsigned int height, unsigned int width, unsigned int channels, const void *const _input, size_t ld_input_col, size_t ld_input_row, size_t ld_input_batch, const PaddingValues &padding, unsigned int output_height, unsigned int output_width, void *const _output, size_t ld_output_col, size_t ld_output_row, size_t ld_output_batch, void *const _working_space, unsigned int thread_id, unsigned int num_threads) const override
 

Detailed Description

template<class strategy>
class arm_conv::pooling::PoolingDepthfirstGenericQuantized< strategy >

Definition at line 34 of file pooling_depthfirst_generic_quantized.hpp.

Constructor & Destructor Documentation

◆ PoolingDepthfirstGenericQuantized() [1/2]

PoolingDepthfirstGenericQuantized ( const PoolingArgs &  args,
const Requantize32 &  rq 
)
inline

Definition at line 53 of file pooling_depthfirst_generic_quantized.hpp.

References PoolingDepthfirstGenericQuantized< strategy >::operator=().

53  : m_args(args), m_requant(rq)
54  {
55  }

◆ PoolingDepthfirstGenericQuantized() [2/2]

Member Function Documentation

◆ execute() [1/3]

void execute ( const void *const  input,
void *const  output,
void *const  working_space,
unsigned int  thread_id,
unsigned int  num_threads 
) const
inline override

Definition at line 70 of file pooling_depthfirst_generic_quantized.hpp.

Referenced by PoolingDepthfirstGenericQuantized< strategy >::execute().

77  {
78  const size_t ld_input_col = m_args.n_channels;
79  const size_t ld_input_row = ld_input_col * m_args.input_cols;
80  const size_t ld_input_batch = ld_input_row * m_args.input_rows;
81  const size_t ld_output_col = ld_input_col;
82  const size_t ld_output_row = ld_output_col * m_args.output_cols;
83  const size_t ld_output_batch = ld_output_row * m_args.output_rows;
84 
85  execute(
86  input, ld_input_col, ld_input_row, ld_input_batch,
87  output, ld_output_col, ld_output_row, ld_output_batch,
88  working_space,
89  thread_id, num_threads
90  );
91  }
void execute(const void *const input, void *const output, void *const working_space, unsigned int thread_id, unsigned int num_threads) const override

◆ execute() [2/3]

void execute ( const void *const  input,
size_t  ld_input_col,
size_t  ld_input_row,
size_t  ld_input_batch,
void *const  output,
size_t  ld_output_col,
size_t  ld_output_row,
size_t  ld_output_batch,
void *const  working_space,
unsigned int  thread_id,
unsigned int  num_threads 
) const
inline override

Definition at line 93 of file pooling_depthfirst_generic_quantized.hpp.

References PoolingDepthfirstGenericQuantized< strategy >::execute().

106  {
107  execute(
108  m_args.n_batches, m_args.input_rows, m_args.input_cols,
109  m_args.n_channels,
110  input, ld_input_col, ld_input_row, ld_input_batch,
111  m_args.padding,
112  m_args.output_rows, m_args.output_cols,
113  output, ld_output_col, ld_output_row, ld_output_batch,
114  working_space,
115  thread_id, num_threads
116  );
117  }
void execute(const void *const input, void *const output, void *const working_space, unsigned int thread_id, unsigned int num_threads) const override

◆ execute() [3/3]

void execute ( unsigned int  batches,
unsigned int  height,
unsigned int  width,
unsigned int  channels,
const void *const  _input,
size_t  ld_input_col,
size_t  ld_input_row,
size_t  ld_input_batch,
const PaddingValues &  padding,
unsigned int  output_height,
unsigned int  output_width,
void *const  _output,
size_t  ld_output_col,
size_t  ld_output_row,
size_t  ld_output_batch,
void *const  _working_space,
unsigned int  thread_id,
unsigned int  num_threads 
) const
inline override

Definition at line 119 of file pooling_depthfirst_generic_quantized.hpp.

References arm_compute::test::validation::batch, batches, arm_gemm::roundup(), PoolingDepthfirstGenericQuantized< strategy >::sizeof_input_pointer_array(), and strategy.

139  {
140  strategy strat(m_args.cpu_info);
141 #ifdef CYCLE_PROFILING
142  arm_gemm::profiler prof;
143 #endif // CYCLE_PROFILING
144 
145  const unsigned int roundup_output_rows = roundup(output_height, num_threads);
146  const unsigned int rows_per_thread = roundup_output_rows / num_threads;
147  int start_out_height = static_cast<int>(thread_id * rows_per_thread);
148  int end_out_height = std::min<int>(output_height, static_cast<int>((thread_id + 1) * rows_per_thread));
149 
150  unsigned int start_channel = 0;
151  unsigned int end_channel = channels;
152  if(output_height == 1)
153  {
154  const unsigned int channels_per_thread = roundup(channels, num_threads) / num_threads;
155  start_channel = thread_id * channels_per_thread;
156  end_channel = std::min(start_channel + channels_per_thread, channels);
157 
158  // Reset start and end rows
159  start_out_height = 0;
160  end_out_height = output_height;
161  }
162 
163  if(start_channel >= end_channel)
164  {
165  // Early exit in case of multiple threads parallelising on channels
166  return;
167  }
168 
169  // Cast input and output pointers into the right types
170  const TInput *const inptr = static_cast<const TInput *>(_input) + start_channel;
171  TOutput *const outptr = static_cast<TOutput *>(_output) + start_channel;
172 
173  // Grab the input pointer array
174  uint8_t *const working_space = static_cast<uint8_t *>(_working_space);
175  const TInput **const inptr_array = reinterpret_cast<const TInput **>(working_space + thread_id * sizeof_input_pointer_array());
176 
177  // For each output tile, construct the requisite set of pointers and call
178  // into the kernel.
179  for (unsigned int batch = 0; batch < batches; batch++)
180  {
181  // Get batch pointers
182  const auto inptr_batch = inptr + batch * ld_input_batch;
183  const auto outptr_batch = outptr + batch * ld_output_batch;
184 
185  for (int out_i = start_out_height; out_i < end_out_height; out_i++)
186  {
187  const int start_in_i = out_i * m_args.pool_stride.rows - padding.top;
188  const int end_in_i = start_in_i + m_args.pool_window.rows;
189 
190  // Compute top/bottom padding
191  const auto pad_top = static_cast<unsigned int>(-std::min(start_in_i, 0));
192  const auto pad_bottom = static_cast<unsigned int>(-std::min(static_cast<int>(height) - end_in_i, 0));
193 
194  // Compute the number of pooling window rows which are contained in
195  // either the valid region of the input tensor, or the padding.
196  const auto padded_bottom = std::min<unsigned int>(
197  start_in_i + m_args.pool_window.rows, height + padding.bottom
198  );
199  const auto n_total_rows = padded_bottom - start_in_i;
200 
201  for (int out_j = 0, start_in_j = -padding.left;
202  out_j < static_cast<int>(output_width);
203  out_j++, start_in_j += m_args.pool_stride.cols)
204  {
205  const int end_in_j = start_in_j + m_args.pool_window.cols;
206 
207  // Compute left/right padding
208  const auto pad_left = static_cast<unsigned int>(-std::min(start_in_j, 0));
209  const auto pad_right = static_cast<unsigned int>(-std::min(static_cast<int>(width) - end_in_j, 0));
210 
211  // Compute the number of pooling window columns which are contained
212  // in either the valid region of the input tensor, or the padding.
213  const auto padded_right = std::min<unsigned int>(
214  start_in_j + m_args.pool_window.cols, width + padding.right
215  );
216  const auto n_total_cols = padded_right - start_in_j;
217 
218  // Construct the input pointer array - fill in all valid points
219  // contiguously.
220  const TInput **ptrs = inptr_array;
221  for (auto i = pad_top; i < input_rows() - pad_bottom; i++)
222  {
223  // Can skip over the left padding because we will have either the
224  // same or less than the previous tile.
225  unsigned int j = pad_left;
226  const TInput *colptr = inptr_batch + (start_in_i + i) * ld_input_row + (start_in_j + j) * ld_input_col;
227  for (; j < input_cols() - pad_right; j++)
228  {
229  *(ptrs++) = colptr;
230  colptr += ld_input_col;
231  }
232  }
233 
234  // Compute the number of valid cells
235  const auto valid_rows = input_rows() - pad_top - pad_bottom;
236  const auto valid_cols = input_cols() - pad_left - pad_right;
237  const auto valid_cells = valid_rows * valid_cols;
238  const auto cells_in_range = n_total_rows * n_total_cols;
239  const auto window_cells = m_args.exclude_padding ? valid_cells : cells_in_range;
240 
241  // Get the output pointer for this call
242  TOutput *outptr = outptr_batch + out_i * ld_output_row + out_j * ld_output_col;
243 
244 #ifdef CYCLE_PROFILING
245  // TODO Work number
246  auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long) 0);
247 #endif
248  strat.kernel(window_cells, valid_cells, end_channel - start_channel, inptr_array, outptr, m_requant);
249  }
250  }
251  }
252  }
T roundup(const T a, const T b)
Definition: utils.hpp:70
const StratType * strategy
unsigned int batches

◆ get_working_size()

size_t get_working_size ( unsigned int  num_threads) const
inline override

◆ operator=()

◆ sizeof_input_pointer_array()

size_t sizeof_input_pointer_array ( void  ) const
inline

Definition at line 60 of file pooling_depthfirst_generic_quantized.hpp.

Referenced by PoolingDepthfirstGenericQuantized< strategy >::execute(), and PoolingDepthfirstGenericQuantized< strategy >::get_working_size().

61  {
62  return sizeof(TInput *) * input_rows() * input_cols();
63  }

The documentation for this class was generated from the following file: