26 #include "pool_common.hpp" 34 template <
class strategy>
// Element type read by the kernel, as declared by the strategy.
37 using TInput =
typename strategy::operand_type;
// Element type written by the kernel, as declared by the strategy.
38 using TOutput =
typename strategy::return_type;
// Pooling configuration. The code in this file reads n_channels, n_batches,
// the input/output row and column counts, pool_type and cpu_info from it.
40 const PoolingArgs m_args;
42 constexpr
static unsigned int input_rows(
void)
44 return (strategy::out_rows() - 1)*strategy::stride_rows() + strategy::pool_rows();
47 constexpr
static unsigned int input_cols(
void)
49 return (strategy::out_cols() - 1)*strategy::stride_cols() + strategy::pool_cols();
52 size_t sizeof_input_buffer(
void)
const 54 return sizeof(TInput) * m_args.n_channels;
57 size_t sizeof_output_buffer(
void)
const 59 return sizeof(TOutput) * m_args.n_channels;
// Total scratch requirement: one input-sized buffer plus one output-sized
// buffer. The enclosing function header is not visible in this excerpt;
// presumably this is the body of get_working_size() - confirm.
75 return sizeof_input_buffer() + sizeof_output_buffer();
// Fragment of an execute overload taking raw tensor pointers; its header and
// some parameters are not visible in this excerpt.
79 const void *
const input,
81 void *
const working_space
// Derive the input strides (in elements) from m_args: one column step spans
// n_channels elements, one row step spans input_cols columns, and one batch
// step spans input_rows rows.
84 const size_t ld_input_col = m_args.n_channels;
85 const size_t ld_input_row = ld_input_col * m_args.input_cols;
86 const size_t ld_input_batch = ld_input_row * m_args.input_rows;
// The output shares the per-column stride of the input; its row and batch
// strides follow the output extents instead.
87 const size_t ld_output_col = ld_input_col;
88 const size_t ld_output_row = ld_output_col * m_args.output_cols;
89 const size_t ld_output_batch = ld_output_row * m_args.output_rows;
// Pointers and derived strides passed onward; the call these arguments belong
// to is not visible here (presumably the strided execute overload - confirm).
92 input, ld_input_col, ld_input_row, ld_input_batch,
93 output, ld_output_col, ld_output_row, ld_output_batch,
// Fragment of an execute overload taking caller-supplied strides; its header
// and some parameters are not visible in this excerpt.
99 const void *
const input,
102 size_t ld_input_batch,
104 size_t ld_output_col,
105 size_t ld_output_row,
106 size_t ld_output_batch,
107 void *
const working_space
// Tensor extents come from m_args while pointers and strides come from the
// caller. The call these arguments belong to is not visible here (presumably
// the fully-parameterised execute overload - confirm).
111 m_args.n_batches, m_args.input_rows, m_args.input_cols,
113 input, ld_input_col, ld_input_row, ld_input_batch,
115 m_args.output_rows, m_args.output_cols,
116 output, ld_output_col, ld_output_row, ld_output_batch,
// Parameter list and setup of the fully-parameterised execute overload. The
// function header and a few parameters are not visible in this excerpt.
125 unsigned int channels,
126 const void *
const _input,
129 size_t ld_input_batch,
130 const PaddingValues &padding,
131 unsigned int output_height,
132 unsigned int output_width,
134 size_t ld_output_col,
135 size_t ld_output_row,
136 size_t ld_output_batch,
137 void *
const _working_space
// Instantiate the kernel strategy from the CPU information held in m_args.
140 strategy strat(m_args.cpu_info);
// The profiler object only exists in CYCLE_PROFILING builds.
141 #ifdef CYCLE_PROFILING 142 arm_gemm::profiler prof;
143 #endif // CYCLE_PROFILING 146 const TInput *
const inptr =
static_cast<const TInput *
>(_input);
147 TOutput *
const outptr =
static_cast<TOutput *
>(_output);
// Carve up the working space: the output scratch buffer sits at the start and
// the input scratch buffer begins sizeof_output_buffer() bytes later.
150 uint8_t *
const working_space =
static_cast<uint8_t *
>(_working_space);
151 TOutput *
const output_buffer =
reinterpret_cast<TOutput *
>(working_space);
152 TInput *
const input_buffer =
reinterpret_cast<TInput *
>(working_space + sizeof_output_buffer());
// Padding value: zero for average pooling; otherwise negative infinity when
// TInput has an infinity. The fallback branch of the inner conditional is not
// visible in this excerpt.
155 const TInput pad_value = (m_args.pool_type == PoolingType::AVERAGE)
156 ? static_cast<TInput>(0)
157 : (std::numeric_limits<TInput>::has_infinity
158 ? -std::numeric_limits<TInput>::infinity()
// Fill the input scratch buffer with the padding value, one entry per channel.
160 for (
unsigned int i = 0; i < channels; i++)
162 input_buffer[i] = pad_value;
// Members of the WorkItem record (its struct header is not visible in this
// excerpt): a rectangle of the output plane, given by its origin
// (output_i, output_j) and its extent (output_height, output_width).
172 unsigned int output_i, output_j;
173 unsigned int output_height, output_width;
// Store the origin (i, j) and the extent (height, width) unchanged.
175 WorkItem(
unsigned int i,
unsigned int j,
unsigned int height,
unsigned int width)
176 : output_i(i), output_j(j), output_height(height), output_width(width) {}
179 auto execute = [&] (
const WorkItem &item) {
181 TOutput * _outptr_array[strategy::out_rows() * strategy::out_cols()];
182 TOutput **
const outptr_array = _outptr_array;
186 const auto output_pad_right = strategy::out_rows() - item.output_width;
187 auto outptr_element = outptr_array;
188 auto outptr_row = outptr + item.output_i * ld_output_row + item.output_j * ld_output_col;
191 for (
unsigned int i = 0; i < strategy::out_rows() * strategy::out_cols(); i++)
193 outptr_array[i] = output_buffer;
197 for (
unsigned int i = 0; i < item.output_height; i++)
199 auto outptr_col = outptr_row;
200 for (
unsigned int j = 0; j < item.output_width; j++)
202 *(outptr_element++) = outptr_col;
203 outptr_col += ld_output_col;
205 outptr_element += output_pad_right;
206 outptr_row += ld_output_row;
210 const int start_i = item.output_i * strategy::stride_rows() - padding.top;
211 const int end_i = start_i + input_rows();
212 const unsigned int pad_top = std::max(0, 0 - start_i);
213 const unsigned int pad_bottom = std::max(0, end_i - static_cast<int>(input_height));
215 const int start_j = item.output_j * strategy::stride_cols() - padding.left;
216 const int end_j = start_j + input_cols();
217 const unsigned int pad_left = std::max(0, 0 - start_j);
218 const unsigned int pad_right = std::max(0, end_j - static_cast<int>(input_width));
221 const TInput * _inptr_array[input_rows() * input_cols()];
222 const TInput **
const inptr_array = _inptr_array;
224 const unsigned int row_padding = pad_top + pad_bottom;
225 const unsigned int valid_rows = input_rows() - row_padding;
227 const unsigned int col_padding = pad_left + pad_right;
228 const unsigned int valid_cols = input_cols() - col_padding;
231 for (
unsigned int i = 0; i < input_rows() * input_cols(); i++)
233 inptr_array[i] = input_buffer;
237 auto inptr_row = inptr + std::max(start_i, 0) * ld_input_row + std::max(start_j, 0) * ld_input_col;
240 auto inptr_element = inptr_array + pad_top * input_cols() + pad_left;
241 for (
unsigned int i = 0; i < valid_rows; i++)
243 auto inptr_col = inptr_row;
244 for (
unsigned int j = 0; j < valid_cols; j++)
246 *(inptr_element++) = inptr_col;
247 inptr_col += ld_input_col;
250 inptr_row += ld_input_row;
251 inptr_element += col_padding;
256 #ifdef CYCLE_PROFILING 258 auto p = prof.ScopedProfiler(PROFILE_KERNEL, (
unsigned long)(item.output_height * item.output_width * strategy::pool_rows() * strategy::pool_cols()));
259 #endif // CYCLE_PROFILING 260 strat.kernel(channels, inptr_array, outptr_array,
261 pad_left, pad_top, pad_right, pad_bottom);
// Work stack, seeded with a single item covering the whole output plane.
265 std::stack<WorkItem, std::vector<WorkItem>> stack;
266 stack.push(WorkItem(0, 0, output_height, output_width));
267 while (!stack.empty())
// Copy the item on top of the stack. The matching pop is not visible in this
// excerpt.
272 const WorkItem item(stack.top());
// An item no larger than one strategy tile needs no further subdivision. The
// body of this branch is not visible here (presumably it runs the tile -
// confirm).
275 if (item.output_height <= strategy::out_rows() &&
276 item.output_width <= strategy::out_cols())
// Otherwise bisect the item along its longer dimension, in whole-tile units.
284 if (item.output_height >= item.output_width)
// Height in tiles is rounded up. The first part takes the larger share when
// the tile count is odd.
286 const unsigned int height_in_tiles = (item.output_height + strategy::out_rows() - 1) / strategy::out_rows();
287 const unsigned int tiles_first = height_in_tiles - height_in_tiles / 2;
289 const unsigned int height_first = tiles_first * strategy::out_rows();
290 const unsigned int height_second = item.output_height - height_first;
// The first part is pushed last, so it is on top of the stack and is taken
// next.
292 stack.push(WorkItem(item.output_i + height_first, item.output_j, height_second, item.output_width));
293 stack.push(WorkItem(item.output_i, item.output_j, height_first, item.output_width));
// NOTE(review): unlike the height count above, the width count rounds down.
// If output_width < out_cols() on this path, width_first is 0 and the second
// part equals the current item. That looks unreachable when out_rows() equals
// out_cols() - confirm for non-square tiles.
297 const unsigned int width_in_tiles = item.output_width / strategy::out_cols();
298 const unsigned int tiles_first = width_in_tiles - width_in_tiles / 2;
300 const unsigned int width_first = tiles_first * strategy::out_cols();
301 const unsigned int width_second = item.output_width - width_first;
// As above, the first part is pushed last so that it is taken next.
303 stack.push(WorkItem(item.output_i, item.output_j + width_first, item.output_height, width_second));
304 stack.push(WorkItem(item.output_i, item.output_j, item.output_height, width_first));
PoolingDepthfirstCacheOblivious & operator=(PoolingDepthfirstCacheOblivious &)=delete
template UniquePoolingCommon< float, float > pooling(const PoolingArgs &, const Nothing &)
void execute(unsigned int batches, unsigned int input_height, unsigned int input_width, unsigned int channels, const void *const _input, size_t ld_input_col, size_t ld_input_row, size_t ld_input_batch, const PaddingValues &padding, unsigned int output_height, unsigned int output_width, void *const _output, size_t ld_output_col, size_t ld_output_row, size_t ld_output_batch, void *const _working_space) const override
size_t get_working_size(void) const override
void execute(const void *const input, size_t ld_input_col, size_t ld_input_row, size_t ld_input_batch, void *const output, size_t ld_output_col, size_t ld_output_row, size_t ld_output_batch, void *const working_space) const override
const size_t input_height
PoolingDepthfirstCacheOblivious(const PoolingArgs &args)
void execute(const void *const input, void *const output, void *const working_space) const override