// --- Fragmented extract: original file line numbers are fused into the text
// and interior lines (switch-case labels, braces, the enclosing function
// signature) are missing. Code below is kept byte-identical. ---
//
// Tail of the kernel configure step: picks the arm_conv assembly pooling
// kernel factory to call based on the source tensor's data type.
// Assembly pooling kernels are only available on AArch64 builds.
58 #if defined(__aarch64__)
// Requantization is required when src and dst carry different quantization
// info (scale/offset); in that case the *_requant factory is used instead.
59 const bool requantize =
src->quantization_info() !=
dst->quantization_info();
// Dispatch on the input data type.
// NOTE(review): the case labels are missing from this extract — presumably
// QASYMM8 / QASYMM8_SIGNED / F16 / F32 in that order; verify against the
// full file.
61 switch (
src->data_type())
// Unsigned 8-bit quantized input, differing src/dst quantization:
// use the requantizing u8 kernel.
66 create_arm_pooling_requant<uint8_t, uint8_t>(
src,
dst,
info, cpu_info);
// Unsigned 8-bit quantized input, matching src/dst quantization.
70 create_arm_pooling<uint8_t, uint8_t>(
src,
dst,
info, cpu_info);
// Signed 8-bit quantized input, differing src/dst quantization.
76 create_arm_pooling_requant<int8_t, int8_t>(
src,
dst,
info, cpu_info);
// Signed 8-bit quantized input, matching src/dst quantization.
80 create_arm_pooling<int8_t, int8_t>(
src,
dst,
info, cpu_info);
// FP16 path is only compiled when half-precision kernels are enabled.
83 #if defined(ENABLE_FP16_KERNELS)
85 create_arm_pooling<float16_t, float16_t>(
src,
dst,
info, cpu_info);
87 #endif // defined(ENABLE_FP16_KERNELS)
// FP32 path.
89 create_arm_pooling<float, float>(
src,
dst,
info, cpu_info);
94 #endif // defined(__aarch64__)
// Register the execution window with the base kernel class.
// NOTE(review): the computation of `win` is not visible in this extract.
97 INEKernel::configure(win);
// --- Fragmented extract of the validate step: the error-check macro names
// and conditions are missing; only message strings and operands are visible.
// Code below is kept byte-identical. ---
//
// Assembly pooling kernels only handle NHWC-layout tensors.
111 "Only NHWC is supported by assembly kernels");
// Only average and max pooling types are implemented in assembly.
113 "Only AVG and MAX pooling are supported by assembly kernels");
// A pooling window fully outside the input has no valid source elements.
117 "Pooling region that is entirely outside input tensor is unsupported by assembly kernels");
// The checks below only apply once the destination has been configured
// (i.e. its total size is non-zero).
119 if (
dst->total_size() > 0)
// Compare uniform (per-tensor) quantization parameters of src and dst.
125 const auto src_qinfo =
src->quantization_info().uniform();
126 const auto dst_qinfo =
dst->quantization_info().uniform();
128 if (src_qinfo != dst_qinfo)
// Requantization case: the effective rescale factor is the ratio of the
// quantization scales; it is then decomposed into a fixed-point
// multiplier (and, presumably, a shift — not visible in this extract).
130 const float multiplier = src_qinfo.scale / dst_qinfo.scale;
131 int32_t dst_multiplier{};
// Same-quantization case: padding is only legal when padded elements are
// excluded from the average, so flag exclude_padding==false with padding.
140 const bool has_padding =
info.pad_stride_info.has_padding();
142 !
info.exclude_padding && has_padding,
143 "Assembly kernels do not support padding for QASYMM8 with same src/dst quantization info");
// NOTE(review): this second, identical padding check presumably sits in a
// parallel branch (e.g. QASYMM8_SIGNED) — the branch structure is missing
// from this extract; verify against the full file.
152 const bool has_padding =
info.pad_stride_info.has_padding();
154 !
info.exclude_padding && has_padding,
155 "Assembly kernels do not support padding for QASYMM8 with same src/dst quantization info");
// --- Fragmented extract of the execution step (enclosing function signature
// and some interior lines are missing). Code below is kept byte-identical. ---
//
// Base pointers to the first real element of each tensor (buffer start plus
// the byte offset of the first element, which skips any leading padding).
174 const auto in_ptr =
src->buffer() +
src->info()->offset_first_element_in_bytes();
175 auto out_ptr =
dst->buffer() +
dst->info()->offset_first_element_in_bytes();
// Shapes and element paddings used to derive leading dimensions.
// NOTE(review): the definition of `dst_shape` (used below) is not visible in
// this extract — presumably dst->info()->tensor_shape(); verify.
179 const auto src_shape =
src->info()->tensor_shape();
181 const auto src_padding =
src->info()->padding();
182 const auto dst_padding =
dst->info()->padding();
// Leading dimensions (in elements) for column / row / batch strides,
// accounting for left/right and top/bottom padding around each dimension.
184 const size_t ld_src_col = src_shape[0] + src_padding.left + src_padding.right;
185 const size_t ld_src_row = ld_src_col * (src_shape[1] + src_padding.top + src_padding.bottom);
186 const size_t ld_src_batch = ld_src_row * src_shape[2];
187 const size_t ld_dst_col =
dst_shape[0] + dst_padding.left + dst_padding.right;
188 const size_t ld_dst_row = ld_dst_col * (
dst_shape[1] + dst_padding.top + dst_padding.bottom);
189 const size_t ld_dst_batch = ld_dst_row *
dst_shape[2];
// Hand off to the assembly kernel; the thread id / thread count allow the
// kernel to partition work across the calling thread pool.
191 _kernel_asm->execute(in_ptr, ld_src_col, ld_src_row, ld_src_batch, out_ptr, ld_dst_col, ld_dst_row, ld_dst_batch,
192 working_space,
info.thread_id,
info.num_threads);
// Fragment: forwards the working-space size query (per thread count) to the
// underlying assembly kernel. Enclosing signature not visible in this extract.
197 return _kernel_asm->get_working_size(num_threads);
// Fragment: the wrapper counts as configured once an assembly kernel has been
// created (i.e. _kernel_asm is non-null). Enclosing signature not visible.
202 return _kernel_asm !=
nullptr;
// --- Fragmented extract: parts of the signature (dst / info / cpu_info
// parameters), the pool_type selection, and several constants (idx_width,
// idx_height, src_rows, dst_rows) are missing. Code kept byte-identical. ---
//
// Creates a non-requantizing arm_conv assembly pooling kernel for the given
// src/dst element types and stores it in _kernel_asm (left unset when the
// configuration is unsupported).
205 template <
typename Typesrc,
typename Typedst>
206 void CpuPool2dAssemblyWrapperKernel::create_arm_pooling(
const ITensorInfo *
src,
// Pool-type selection: only the AVERAGE branch of the ternary is visible;
// presumably the alternative is PoolingType::MAX — verify against full file.
212 ? arm_conv::pooling::PoolingType::AVERAGE
// Pooling window dimensions taken from the layer info's pool size.
215 arm_conv::pooling::PoolingWindow window{};
216 window.cols =
static_cast<unsigned int>(
info.pool_size.x());
217 window.rows =
static_cast<unsigned int>(
info.pool_size.y());
// Strides from the pad/stride info (returned as a (cols, rows) pair).
219 arm_conv::pooling::PoolingStride stride{};
220 std::tie(stride.cols, stride.rows) =
info.pad_stride_info.stride();
// Explicit padding amounts on each side of the input.
222 const arm_conv::pooling::PaddingValues padding{
info.pad_stride_info.pad_left(),
info.pad_stride_info.pad_top(),
223 info.pad_stride_info.pad_right(),
info.pad_stride_info.pad_bottom()};
// NHWC dimension indices: channels are dimension 0, batches dimension 3
// (width/height indices 1/2 are presumably defined in the missing lines).
227 constexpr
unsigned int idx_channels = 0;
228 constexpr
unsigned int idx_batches = 3;
// Tensor extents fed to the assembly kernel.
230 const unsigned int n_batches =
src->dimension(idx_batches);
232 const unsigned int src_cols =
src->dimension(
idx_width);
233 const unsigned int n_channels =
src->dimension(idx_channels);
235 const unsigned int dst_cols =
dst->dimension(
idx_width);
// Aggregate all parameters for the arm_conv pooling factory.
237 arm_conv::pooling::PoolingArgs
args(&cpu_info, pool_type, window, stride,
info.exclude_padding, n_batches, src_rows,
238 src_cols, n_channels, dst_rows, dst_cols, padding,
nullptr);
// Ask arm_conv for a matching assembly implementation; nullptr means the
// configuration is unsupported and the wrapper stays unconfigured.
241 auto pooling_kernel_asm = arm_conv::pooling::pooling<Typesrc, Typedst>(
args);
242 if (pooling_kernel_asm ==
nullptr)
// Take ownership of the created kernel.
248 _kernel_asm = std::move(pooling_kernel_asm);
// --- Fragmented extract: parts of the signature, the pool_type selection,
// several constants (idx_width, idx_height, src_rows, dst_rows), and the
// multiplier/shift decomposition are missing. Code kept byte-identical. ---
//
// Creates a requantizing arm_conv assembly pooling kernel (used when src and
// dst quantization info differ) and stores it in _kernel_asm.
251 template <
typename Typesrc,
typename Typedst>
252 void CpuPool2dAssemblyWrapperKernel::create_arm_pooling_requant(
const ITensorInfo *
src,
254 const PoolingLayerInfo &
info,
// Pool-type selection: only the AVERAGE branch of the ternary is visible;
// presumably the alternative is PoolingType::MAX — verify against full file.
258 ? arm_conv::pooling::PoolingType::AVERAGE
// Pooling window dimensions taken from the layer info's pool size.
261 arm_conv::pooling::PoolingWindow window{};
262 window.cols =
static_cast<unsigned int>(
info.pool_size.x());
263 window.rows =
static_cast<unsigned int>(
info.pool_size.y());
// Strides from the pad/stride info (returned as a (cols, rows) pair).
265 arm_conv::pooling::PoolingStride stride{};
266 std::tie(stride.cols, stride.rows) =
info.pad_stride_info.stride();
// Explicit padding amounts on each side of the input.
268 const arm_conv::pooling::PaddingValues padding{
info.pad_stride_info.pad_left(),
info.pad_stride_info.pad_top(),
269 info.pad_stride_info.pad_right(),
info.pad_stride_info.pad_bottom()};
// NHWC dimension indices: channels are dimension 0, batches dimension 3
// (width/height indices 1/2 are presumably defined in the missing lines).
273 constexpr
unsigned int idx_channels = 0;
274 constexpr
unsigned int idx_batches = 3;
// Tensor extents fed to the assembly kernel.
276 const unsigned int n_batches =
src->dimension(idx_batches);
278 const unsigned int src_cols =
src->dimension(
idx_width);
279 const unsigned int n_channels =
src->dimension(idx_channels);
281 const unsigned int dst_cols =
dst->dimension(
idx_width);
// Aggregate all parameters for the arm_conv pooling factory.
283 arm_conv::pooling::PoolingArgs
args(&cpu_info, pool_type, window, stride,
info.exclude_padding, n_batches, src_rows,
284 src_cols, n_channels, dst_rows, dst_cols, padding,
nullptr);
// Requantization parameters: the float rescale factor (src scale / dst
// scale) is decomposed into a fixed-point multiplier (and, presumably, a
// shift — the decomposition call is missing from this extract).
286 const auto src_qinfo =
src->quantization_info().uniform();
287 const auto dst_qinfo =
dst->quantization_info().uniform();
289 const float multiplier = src_qinfo.scale / dst_qinfo.scale;
290 int32_t dst_multiplier{};
// Bundle zero-point offsets (and the fixed-point rescale, in the missing
// arguments) for the requantizing kernel.
294 const arm_conv::pooling::Requantize32 requant_args(src_qinfo.offset, dst_qinfo.offset,
// Ask arm_conv for a requantizing implementation; nullptr means the
// configuration is unsupported and the wrapper stays unconfigured.
300 auto pooling_kernel_asm =
301 arm_conv::pooling::pooling<Typesrc, Typedst, arm_conv::pooling::Requantize32>(
args, requant_args);
302 if (pooling_kernel_asm ==
nullptr)
// Take ownership of the created kernel.
308 _kernel_asm = std::move(pooling_kernel_asm);
// Start of the minimum-workload-size query (signature only; the body runs
// past the end of this extract, so no behavior is documented here).
311 size_t CpuPool2dAssemblyWrapperKernel::get_mws(
const CPUInfo &platform,
size_t thread_count)
const