37 template <
typename TInput,
typename TOutput>
40 unsigned int input_rows, input_cols, output_rows, output_cols;
44 unsigned int stride_rows,
unsigned int stride_cols,
45 unsigned int output_rows,
unsigned int output_cols)
46 : input_rows(output_rows + (window_rows - 1) * stride_rows),
47 input_cols(output_cols + (window_cols - 1) * stride_cols),
48 output_rows(output_rows), output_cols(output_cols)
58 unsigned int n_channels,
59 const TInput *
const *,
62 unsigned int pad_left,
64 unsigned int pad_right,
65 unsigned int pad_bottom
78 template <
typename TInput,
typename TOutput=TInput,
class OutputStage=Nothing>
81 size_t sizeof_input_buffer(
void)
const 83 return sizeof(TInput) * this->m_args.n_channels;
86 size_t sizeof_output_buffer(
void)
const 88 return sizeof(TOutput) * this->m_args.n_channels;
93 size_t get_working_size_per_thread(
unsigned int n_channels)
const override 95 return sizeof(
WorkingSpace) + n_channels * (
sizeof(TInput) +
sizeof(TOutput));
99 void initialise_working_space(
void *raw_ws,
unsigned int n_channels)
const override 103 ws->output_buffer =
reinterpret_cast<TInput *
>(ws + 1) + n_channels;
109 using limits = std::numeric_limits<TInput>;
110 if (limits::has_infinity)
112 fill_val = -limits::infinity();
116 fill_val = limits::min();
120 auto ptr =
reinterpret_cast<TInput *
>(ws->input_buffer);
121 for (; n_channels; n_channels--)
128 void compute_tile_padded(
129 unsigned int output_i,
unsigned int output_j,
130 unsigned int channel_start,
unsigned int channel_end,
140 auto ws =
reinterpret_cast<WorkingSpace *
>(working_space);
141 auto inptr_array =
reinterpret_cast<const TInput **
>(alloca(
142 sizeof(TInput *) * this->m_strat->get_input_rows() * this->m_strat->get_input_cols()));
143 auto outptr_array =
reinterpret_cast<TOutput **
>(alloca(
144 sizeof(TOutput *) * this->m_strat->get_output_rows() * this->m_strat->get_output_cols()));
// --- Rows ---
// Signed input-row coordinate of the tile: the output row scaled by the row
// stride, shifted by the top padding. It is negative when the tile starts
// above row 0 of the input.
147 const int ii =
static_cast<int>(output_i * this->m_args.pool_stride.rows) - this->m_args.padding.top;
// Number of tile rows above the input (-ii when ii is negative, else 0).
148 const auto input_pad_top = static_cast<unsigned int>(ii < 0 ? -ii : 0);
// First input row actually read: ii clamped to be non-negative.
149 const auto input_i =
static_cast<unsigned int>(ii < 0 ? 0 : ii);
// One-past-the-last input row covered by the tile, using the strategy's
// input tile height. Mixed signed/unsigned arithmetic, stored as unsigned.
151 const unsigned int end_ii = ii + this->m_strat->get_input_rows();
// Number of tile rows at or beyond m_args.input_rows (0 if the tile ends
// inside the input).
152 const auto input_pad_bottom = end_ii < this->m_args.input_rows ? 0 : end_ii - this->m_args.input_rows;
// --- Columns (same scheme as for rows, using the column stride and left padding) ---
154 const int ij =
static_cast<int>(output_j * this->m_args.pool_stride.cols) - this->m_args.padding.left;
// Number of tile columns left of the input (-ij when ij is negative, else 0).
155 const auto input_pad_left = static_cast<unsigned int>(ij < 0 ? -ij : 0);
// First input column actually read: ij clamped to be non-negative.
156 const auto input_j =
static_cast<unsigned int>(ij < 0 ? 0 : ij);
// One-past-the-last input column covered by the tile, using the strategy's
// input tile width.
158 const unsigned int end_ij = ij + this->m_strat->get_input_cols();
// Number of tile columns at or beyond m_args.input_cols (0 if the tile ends
// inside the input).
159 const auto input_pad_right = end_ij < this->m_args.input_cols ? 0 : end_ij - this->m_args.input_cols;
161 fill_pointer_array<const TInput>(
162 inptr_array, this->m_strat->get_input_rows(), this->m_strat->get_input_cols(),
165 reinterpret_cast<const TInput *
>(ws->input_buffer),
166 input_pad_top, this->m_args.input_rows - input_i,
167 input_pad_left, this->m_args.input_cols - input_j
172 outptr_array, this->m_strat->get_output_rows(), this->m_strat->get_output_cols(),
173 output.
base + output_i*output.
ld_row + output_j*output.
ld_col + channel_start,
175 reinterpret_cast<TOutput *
>(ws->output_buffer),
176 0, this->m_args.output_rows - output_i,
177 0, this->m_args.output_cols - output_j
183 this->m_args.exclude_padding,
184 input_pad_left, input_pad_top,
185 input_pad_right, input_pad_bottom
190 void compute_row_padded_tile_row(
191 const unsigned int output_i,
unsigned int output_j,
unsigned int n_tile_cols,
192 const unsigned int channel_start,
const unsigned int channel_end,
202 auto ws =
reinterpret_cast<WorkingSpace *
>(working_space);
203 auto inptr_array =
reinterpret_cast<const TInput **
>(alloca(
204 sizeof(TInput *) * this->m_strat->get_input_rows() * this->m_strat->get_input_cols()));
205 auto outptr_array =
reinterpret_cast<TOutput **
>(alloca(
206 sizeof(TOutput *) * this->m_strat->get_output_rows() * this->m_strat->get_output_cols()));
209 const int ii =
static_cast<int>(output_i * this->m_args.pool_stride.rows) - this->m_args.padding.top;
210 const auto input_pad_top = static_cast<unsigned int>(ii < 0 ? -ii : 0);
211 const auto input_i =
static_cast<unsigned int>(ii < 0 ? 0 : ii);
213 const unsigned int end_ii = ii + this->m_strat->get_input_rows();
214 const auto input_pad_bottom = end_ii < this->m_args.input_rows ? 0 : end_ii - this->m_args.input_rows;
216 const int ij =
static_cast<int>(output_j * this->m_args.pool_stride.cols) - this->m_args.padding.left;
217 const auto input_j = static_cast<unsigned int>(ij < 0 ? 0 : ij);
219 const auto end_oi = output_i + this->m_strat->get_output_cols();
220 const auto output_pad_bottom = end_oi < this->m_args.output_rows ? 0 : end_oi - this->m_args.output_rows;
222 fill_pointer_array<const TInput>(
223 inptr_array, this->m_strat->get_input_rows(), this->m_strat->get_input_cols(),
226 reinterpret_cast<const TInput *
>(ws->input_buffer),
227 input_pad_top, this->m_args.input_rows - input_i,
228 0, this->m_args.input_cols - input_j
233 outptr_array, this->m_strat->get_output_rows(), this->m_strat->get_output_cols(),
234 output.
base + output_i*output.
ld_row + output_j*output.
ld_col + channel_start,
236 reinterpret_cast<TOutput *
>(ws->output_buffer),
237 0, this->m_args.output_rows - output_i,
238 0, this->m_args.output_cols - output_j
242 for (; n_tile_cols; n_tile_cols--)
246 this->m_args.exclude_padding,
252 const auto input_col_stride = input.
ld_col * this->m_strat->get_output_cols() * this->m_args.pool_stride.cols;
254 auto n = input_pad_top * this->m_strat->get_input_cols();
255 n < (this->m_strat->get_input_rows() - input_pad_bottom) * this->m_strat->get_input_cols();
259 inptr_array[
n] += input_col_stride;
262 const auto output_col_stride = output.
ld_col * this->m_strat->get_output_cols();
265 n < (this->m_strat->get_output_rows() - output_pad_bottom) * this->m_strat->get_output_cols();
276 const PoolingArgs &
args,
const OutputStage &os = {})
unsigned int get_input_cols() const override
template UniquePoolingCommon< float, float > pooling(const PoolingArgs &, const Nothing &)
#define ARM_COMPUTE_UNUSED(...)
Suppresses unused-variable warnings.
DepthfirstStrategy(unsigned int window_rows, unsigned int window_cols, unsigned int stride_rows, unsigned int stride_cols, unsigned int output_rows, unsigned int output_cols)
unsigned int get_output_rows() const override
unsigned int get_output_cols() const override
unsigned int get_input_rows() const override
void fill_pointer_array(size_t element_size, void **dest_raw, const unsigned int array_rows, const unsigned int array_cols, void *base_ptr_raw, size_t ld_row, size_t ld_col, void *pad_buffer_raw, const unsigned int pad_top, const unsigned int valid_rows, const unsigned int pad_left, const unsigned int valid_cols)
virtual KernelType get_kernel(void) const =0
PoolingDepthfirst(const DepthfirstStrategy< TInput, TOutput > *strat, const PoolingArgs &args, const OutputStage &os={})
void(* KernelType)(unsigned int n_channels, const TInput *const *, TOutput *const *, bool exclude_padding, unsigned int pad_left, unsigned int pad_top, unsigned int pad_right, unsigned int pad_bottom)