29 #if !defined(_WIN64) && !defined(__OpenBSD__)
// Template header taking an input element type, an output element type and
// an output-stage type; the output stage defaults to Nothing when omitted.
// NOTE(review): the entity this header introduces is not visible here.
36 template <
typename TInput,
typename TOutput,
typename OutputStage = Nothing>
// Declaration templated on the input and output element types only (no
// output-stage parameter). The visible part of its parameter list takes two
// 64-bit cell counts followed by a pointer to const pointers to const TInput.
// NOTE(review): the declared name and the remaining parameters are not
// visible here - confirm against the full file.
39 template <
typename TInput,
typename TOutput>
46 uint64_t window_cells,
47 uint64_t n_valid_cells,
49 const TInput *
const *,
// Second declaration templated on the input and output element types. Its
// visible parameters are two 64-bit cell counts and a pointer to const
// pointers to const TInput.
// NOTE(review): the declared name and any trailing parameters are not
// visible here - confirm against the full file.
56 template <
typename TInput,
typename TOutput>
63 uint64_t window_cells,
64 uint64_t n_valid_cells,
66 const TInput *
const *,
// Template header with three type parameters (input type, output type and
// output stage); no defaults are given here.
// NOTE(review): the entity this header introduces is not visible here.
74 template <
typename TInput,
typename TOutput,
typename OutputStage>
// Variant templated on input/output types only. It forwards the cell counts,
// channel count, input pointer array and output pointer to `kern` without any
// output-stage argument.
// NOTE(review): presumably the specialisation used when there is no output
// stage - the specialisation header is not visible here, confirm.
77 template <
typename TInput,
typename TOutput>
82 uint64_t window_cells,
83 uint64_t n_valid_cells,
85 const TInput *
const *inptrs,
// Plain kernel call: five arguments, no quantisation parameters.
90 kern(window_cells, n_valid_cells, n_channels, inptrs, outptr);
// Partial specialisation of Invoker for the Requantize32 output stage. It
// makes the same kernel call as the five-argument form, but additionally
// passes the Requantize32 parameters `qp` as the final argument.
94 template <
typename TInput,
typename TOutput>
95 struct Invoker<TInput, TOutput, Requantize32>
99 uint64_t window_cells,
100 uint64_t n_valid_cells,
102 const TInput *
const *inptrs,
// Requantisation parameters, taken by const reference.
104 const Requantize32 &qp
// Kernel call with the output-stage parameters appended.
107 kern(window_cells, n_valid_cells, n_channels, inptrs, outptr, qp);
// Class template fragment holding a strategy object together with the pooling
// window dimensions.
// NOTE(review): the class name and base class are not visible here.
111 template <
typename TInput,
typename TOutput,
typename OutputStage>
// Strategy is held through a unique_ptr to a const StratType.
116 std::unique_ptr<const StratType> m_strat;
// Window height and width, fixed at construction.
117 const unsigned int window_rows, window_cols;
// Constructor initialiser list: m_strat is initialised from `strat`, and the
// window dimensions are copied from args.pool_window.
121 : m_strat(strat), window_rows(
args.pool_window.
rows), window_cols(
args.pool_window.
cols)
// Class template header: the output type defaults to the input type and the
// output stage defaults to Nothing.
// NOTE(review): the class name and base class are not visible here.
133 template <
typename TInput,
typename TOutput=TInput,
typename OutputStage=Nothing>
// Output-stage parameters; const, so fixed for the lifetime of the object.
136 const OutputStage m_os;
// Override reporting the per-thread working-space size: always 0.
139 size_t get_working_size_per_thread()
const override {
return 0; }
// Override that does nothing: the pointer argument is unnamed and the body is
// empty.
140 void initialise_working_space(
void *)
const override { }
// Handles the output position (output_i, output_j) for the channel range
// starting at channel_start, where the pooling window may extend past the
// edges of the input. It works out how much of the window overlaps real
// input, builds an array of pointers to those input cells, and computes the
// cell counts to pass on.
// NOTE(review): the loop bodies and the callee of the final argument list are
// not visible here.
143 void compute_tile_padded(
144 unsigned int output_i,
unsigned int output_j,
145 unsigned int channel_start,
unsigned int channel_end,
// Vertical extent. start_i is the window's first row relative to the input
// and is negative when the window begins inside the top padding.
152 const int start_i =
static_cast<int>(output_i * this->m_args.pool_stride.rows) - this->m_args.padding.top;
// First row actually read: start_i clamped to zero.
153 const auto input_i =
static_cast<unsigned int>(start_i < 0 ? 0 : start_i);
// Number of window rows lying above the input.
154 const auto pad_top =
static_cast<unsigned int>(start_i < 0 ? -start_i : 0);
// One past the last window row.
155 const int end_i = start_i + this->m_args.pool_window.rows;
// Number of window rows lying below the input.
// NOTE(review): end_i is cast to unsigned for the comparison, so a negative
// end_i would compare as a very large value - confirm callers never produce
// one.
156 const auto pad_bottom =
static_cast<unsigned int>((
unsigned int) end_i < this->m_args.input_rows ? 0 : end_i - this->m_args.input_rows);
// Window rows that overlap real input.
// NOTE(review): assuming unsigned operands, this would wrap if
// pad_top + pad_bottom exceeded the window height - confirm.
157 const auto valid_rows = this->m_args.pool_window.rows - (pad_top + pad_bottom);
// Horizontal extent, computed the same way as the vertical one.
159 const int start_j =
static_cast<int>(output_j * this->m_args.pool_stride.cols) - this->m_args.padding.left;
160 const auto input_j =
static_cast<unsigned int>(start_j < 0 ? 0 : start_j);
161 const auto pad_left =
static_cast<unsigned int>(start_j < 0 ? -start_j : 0);
162 const int end_j = start_j + this->m_args.pool_window.cols;
163 const auto pad_right =
static_cast<unsigned int>((
unsigned int) end_j < this->m_args.input_cols ? 0 : end_j - this->m_args.input_cols);
164 const auto valid_cols = this->m_args.pool_window.cols - (pad_left + pad_right);
// One input pointer per valid cell; the array is obtained from alloca.
// NOTE(review): the allocation size grows with the window area and is not
// bounded here.
167 const auto n_valid_cells = valid_rows * valid_cols;
168 auto inptrs =
reinterpret_cast<const TInput **
>(alloca(n_valid_cells *
sizeof(TInput *)));
// Fill cursor into inptrs, and a pointer to the first valid input cell for
// the first channel of the requested range.
170 auto my_ptr = inptrs;
171 auto row_ptr =
input.base + input_i*
input.ld_row + input_j*
input.ld_col + channel_start;
// Walk the valid rows, counting down; row_ptr advances by one row stride.
// NOTE(review): the rest of the loop body is not visible here.
172 for (
auto i = valid_rows; i; i--)
175 row_ptr +=
input.ld_row;
// Walk the valid columns, counting down.
// NOTE(review): presumably nested in the row loop and storing one pointer
// per cell through my_ptr - body not visible, confirm.
177 for (
auto j = valid_cols; j; j--)
// Destination for this output position and first channel.
185 auto outptr = output.
base + output_i*output.
ld_row + output_j*output.
ld_col + channel_start;
// Cell count passed on as window_cells: the valid cells only when
// exclude_padding is set, otherwise every cell of the window that lies
// within the input extended by its bottom/right padding.
190 const auto bottom_padded_height = this->m_args.input_rows + this->m_args.padding.bottom;
191 const auto captured_rows = std::min<int>(end_i, bottom_padded_height) - start_i;
192 const auto right_padded_width = this->m_args.input_cols + this->m_args.padding.right;
193 const auto captured_cols = std::min<int>(end_j, right_padded_width) - start_j;
194 const auto captured_cells = captured_rows * captured_cols;
195 const auto window_cells = this->m_args.exclude_padding ? n_valid_cells : captured_cells;
// Argument list: cell counts, number of channels in the range, input
// pointers, output pointer and the stored output stage.
// NOTE(review): the callee is not visible here.
200 window_cells, n_valid_cells, channel_end - channel_start, inptrs, outptr, m_os
// Handles n_tile_cols consecutive output positions on output row output_i,
// starting at column output_j. Only top/bottom padding is computed: the full
// window width is treated as valid, so no left/right padding is handled here.
// The input pointer array is built once and then shifted horizontally.
// NOTE(review): the loop bodies and the callee of the argument list are not
// visible here.
205 void compute_row_padded_tile_row(
206 const unsigned int output_i,
unsigned int output_j,
unsigned int n_tile_cols,
207 const unsigned int channel_start,
const unsigned int channel_end,
// Vertical extent. start_i is the window's first row relative to the input
// and is negative when the window begins inside the top padding.
214 const int start_i =
static_cast<int>(output_i * this->m_args.pool_stride.rows) - this->m_args.padding.top;
// First row actually read: start_i clamped to zero.
215 const auto input_i =
static_cast<unsigned int>(start_i < 0 ? 0 : start_i);
// Number of window rows lying above the input.
216 const auto pad_top =
static_cast<unsigned int>(start_i < 0 ? -start_i : 0);
// One past the last window row.
217 const int end_i = start_i + this->m_args.pool_window.rows;
// Number of window rows lying below the input.
// NOTE(review): end_i is cast to unsigned for the comparison, so a negative
// end_i would compare as a very large value - confirm callers never produce
// one.
218 const auto pad_bottom =
static_cast<unsigned int>((
unsigned int) end_i < this->m_args.input_rows ? 0 : end_i - this->m_args.input_rows);
// Window rows that overlap real input.
// NOTE(review): assuming unsigned operands, this would wrap if
// pad_top + pad_bottom exceeded the window height - confirm.
219 const auto valid_rows = this->m_args.pool_window.rows - (pad_top + pad_bottom);
// Horizontal start; a negative start_j is clamped to zero for input_j.
221 const int start_j =
static_cast<int>(output_j * this->m_args.pool_stride.cols) - this->m_args.padding.left;
222 const auto input_j =
static_cast<unsigned int>(start_j < 0 ? 0 : start_j);
// Every window column is counted as valid.
223 const auto valid_cols = this->m_args.pool_window.cols;
// One input pointer per valid cell; the array is obtained from alloca.
// NOTE(review): the allocation size grows with the window area and is not
// bounded here.
226 const auto n_valid_cells = valid_rows * valid_cols;
227 auto inptrs =
reinterpret_cast<const TInput **
>(alloca(n_valid_cells *
sizeof(TInput *)));
// Fill cursor into inptrs, and a pointer to the first valid input cell for
// the first channel of the requested range.
229 auto my_ptr = inptrs;
230 auto row_ptr =
input.base + input_i*
input.ld_row + input_j*
input.ld_col + channel_start;
// Walk the valid rows, counting down; row_ptr advances by one row stride.
// NOTE(review): the rest of the loop body is not visible here.
231 for (
auto i = valid_rows; i; i--)
234 row_ptr +=
input.ld_row;
// Walk the window columns, counting down.
// NOTE(review): presumably nested in the row loop and storing one pointer
// per cell through my_ptr - body not visible, confirm.
236 for (
auto j = valid_cols; j; j--)
// Destination for the first output position and first channel.
244 auto outptr = output.
base + output_i*output.
ld_row + output_j*output.
ld_col + channel_start;
// Cell count passed on as window_cells: the valid cells only when
// exclude_padding is set, otherwise the rows of the window that lie within
// the input extended by its bottom padding, times the full window width.
249 const auto bottom_padded_height = this->m_args.input_rows + this->m_args.padding.bottom;
250 const auto captured_rows = std::min<int>(end_i, bottom_padded_height) - start_i;
251 const auto captured_cells = captured_rows * valid_cols;
252 const auto window_cells = this->m_args.exclude_padding ? n_valid_cells : captured_cells;
// Count down over the requested output columns; n_tile_cols is consumed.
254 for (; n_tile_cols; n_tile_cols--)
// Argument list: cell counts, number of channels in the range, input
// pointers, output pointer and the stored output stage.
// NOTE(review): the callee is not visible here.
259 window_cells, n_valid_cells, channel_end - channel_start, inptrs, outptr, m_os
// Shift every stored input pointer right by one horizontal pooling stride.
// NOTE(review): presumably executed once per tile column, together with an
// outptr advance that is not visible here - confirm.
265 for (
auto n = 0u; n < n_valid_cells; n++)
267 inptrs[n] += this->m_args.pool_stride.cols *
input.ld_col;
// Constructor parameter fragment: the pooling arguments are taken by const
// reference, and the output stage defaults to a value-initialised
// OutputStage when the caller omits it.
// NOTE(review): the constructor name and any other parameters are not
// visible here.
275 const PoolingArgs &
args,
276 const OutputStage &os = {}