#ifdef ENABLE_NCHW_KERNELS

#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
void pooling3_fp16_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
{
    ARM_COMPUTE_UNUSED(dst1);
    Iterator in(src, window_src);
    Iterator out(dst0, window);
    constexpr const int pool_size       = 3;
    const int           pool_pad_right  = pool_info.pad_stride_info.pad_right();
    const int           pool_pad_top    = pool_info.pad_stride_info.pad_top();
    const int           pool_pad_left   = pool_info.pad_stride_info.pad_left();
    const int           pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
    int                 pool_stride_x   = 0;
    int                 pool_stride_y   = 0;
    std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
    const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right);
    const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
    const unsigned char *const src_top_ptr    = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
    const unsigned char *const src_middle_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
    const unsigned char *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
    execute_window_loop(window, [&](const Coordinates &id)
    {
        float16x4_t top_data    = vld1_f16(reinterpret_cast<const float16_t *>(src_top_ptr + in.offset()));
        float16x4_t middle_data = vld1_f16(reinterpret_cast<const float16_t *>(src_middle_ptr + in.offset()));
        float16x4_t bottom_data = vld1_f16(reinterpret_cast<const float16_t *>(src_bottom_ptr + in.offset()));
        float16x4_t res         = {};
        // Get power of 2 in case of l2 pooling
        if(pool_info.pool_type == PoolingType::L2)
        {
            top_data    = vmul_f16(top_data, top_data);
            middle_data = vmul_f16(middle_data, middle_data);
            bottom_data = vmul_f16(bottom_data, bottom_data);
        }
        if(pool_info.pool_type != PoolingType::MAX)
        {
            // Calculate scale
            const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size,
                                                    upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
            const float16x4_t scale_v = vdup_n_f16(scale);

            // Perform pooling
            const float16x4_t sum_data = vadd_f16(vadd_f16(top_data, bottom_data), middle_data);
            res = vpadd_f16(vset_lane_f16(0.f, sum_data, 3), sum_data);
            res = vmul_f16(vpadd_f16(res, res), scale_v);
        }
        else
        {
            const float16x4_t max_data = vmax_f16(vmax_f16(top_data, bottom_data), middle_data);
            res = vpmax_f16(vset_lane_f16(-std::numeric_limits<float>::max(), max_data, 3), max_data);
            res = vpmax_f16(res, res);
        }
        // Calculate square-root in case of l2 pooling
        if(pool_info.pool_type == PoolingType::L2)
        {
            res = vinv_f16(vinvsqrt_f16(res));
        }
        // Store result
        *(reinterpret_cast<float16_t *>(out.ptr())) = vget_lane_f16(res, 0);
    },
    in, out);
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
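// Note on the reduction idiom above (my annotation, not the library's): lane 3 of the 4-lane
// vector falls outside the 3-wide pooling window, so it is overwritten with the operation's
// neutral element (0 for sum, -FLT_MAX for max) before the pairwise steps. Scalar sketch of the
// average path, with s = {s0, s1, s2, s3} holding the per-column sums of the three rows:
//   vset_lane_f16(0, s, 3)        -> {s0, s1, s2, 0}
//   vpadd_f16(that, s)            -> {s0+s1, s2, s0+s1, s2+s3}
//   vpadd_f16(res, res), lane 0   -> s0+s1+s2   (the window sum, then multiplied by scale)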
template <typename T>
inline typename std::enable_if<std::is_same<T, float16_t>::value, float32x2_t>::type
f16_to_f32(float16x4_t in)
{
    float32x2_t out = { static_cast<float>(vget_lane_f16(in, 0)), static_cast<float>(vget_lane_f16(in, 1)) };
    return out;
}
template <typename T>
inline typename std::enable_if<std::is_same<T, float>::value, float32x2_t>::type
f16_to_f32(float32x2_t in)
{
    return in;
}
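// The two f16_to_f32 overloads above dispatch on the element type via SFINAE (std::enable_if on
// std::is_same): the float16_t instantiation widens the first two half-precision lanes to f32,
// while the float instantiation is a plain identity. This lets pooling2_nchw_maxpool_indices<T>
// below run its comparison logic on float32x2_t for both element types.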
template <typename T>
void pooling2_nchw_maxpool_indices(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
{
    Iterator in(src, window_src);
    Iterator out(dst0, window);
    Iterator indices(dst1, window);
    const int pool_pad_top  = pool_info.pad_stride_info.pad_top();
    const int pool_pad_left = pool_info.pad_stride_info.pad_left();
    int pool_stride_x = 0;
    int pool_stride_y = 0;
    std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
    const uint8_t *const src_top_ptr    = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
    const uint8_t *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
    const int pad_left    = src->info()->padding().left;
    const int pad_right   = src->info()->padding().right;
    const int in_stride_y = static_cast<int>(src->info()->strides_in_bytes().y());
    execute_window_loop(window, [&](const Coordinates &id)
    {
        auto        top_data        = wrapper::vload(reinterpret_cast<const T *>(src_top_ptr + in.offset()));
        auto        bottom_data     = wrapper::vload(reinterpret_cast<const T *>(src_bottom_ptr + in.offset()));
        float32x2_t top_data_f32    = f16_to_f32<T>(top_data);
        float32x2_t bottom_data_f32 = f16_to_f32<T>(bottom_data);

        // Calculate max data, comparing the top row first so the first maximum is the one recorded
        const float32x2_t max_data_top    = vpmax_f32(top_data_f32, top_data_f32);
        const float32x2_t max_data_bottom = vpmax_f32(bottom_data_f32, bottom_data_f32);
        const float32x2_t max_data        = vmax_f32(max_data_top, max_data_bottom);
        *(reinterpret_cast<T *>(out.ptr())) = static_cast<T>(vget_lane_f32(max_data, 0));
        // Calculate the max-data index, which will be used by max unpooling
        // (offset_base, the linear element offset with padding removed, comes from a helper
        // that is elided from this excerpt)
        const uint32_t   offset_top     = (uint32_t)(offset_base / sizeof(T));
        const uint32_t   offset_bottom  = offset_top + in_stride_y / sizeof(T) - pad_right - pad_left;
        const uint32x2_t voffset_top    = { offset_top, offset_top + 1u };
        const uint32x2_t voffset_bottom = { offset_bottom, offset_bottom + 1u };
        const uint32x2_t tmp_indices_top    = vbsl_u32(vcge_f32(top_data_f32, vrev64_f32(top_data_f32)), voffset_top, vrev64_u32(voffset_top));
        const uint32x2_t tmp_indices_bottom = vbsl_u32(vcge_f32(bottom_data_f32, vrev64_f32(bottom_data_f32)), voffset_bottom, vrev64_u32(voffset_bottom));
        *(reinterpret_cast<int *>(indices.ptr())) = vget_lane_u32(vbsl_u32(vcge_f32(max_data_top, max_data_bottom), tmp_indices_top, tmp_indices_bottom), 0);
    },
    in, out, indices);
}
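// How the branch-free argmax above works (my annotation): vrev64_f32 swaps the two lanes of a
// float32x2_t, so vcge_f32(v, vrev64_f32(v)) computes, per lane, "is my value >= the other
// lane's?". vbsl_u32 then keeps each lane's own offset where the test holds and substitutes the
// swapped offset where it does not, so lane 0 of tmp_indices_top/bottom always carries the offset
// of that row's maximum. A final vbsl_u32 on the row maxima chooses between the top and bottom
// candidate. Because the comparisons use >= and the top row is tested first, ties resolve to the
// earlier (top/left) element.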
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
void pooling2_fp16_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
{
    if(pool_info.pool_type == PoolingType::MAX && dst1)
    {
        pooling2_nchw_maxpool_indices<float16_t>(src, dst0, dst1, pool_info, window_src, window);
    }
    else
    {
        Iterator in(src, window_src);
        Iterator out(dst0, window);
        constexpr int pool_size   = 2;
        const int pool_pad_right  = pool_info.pad_stride_info.pad_right();
        const int pool_pad_top    = pool_info.pad_stride_info.pad_top();
        const int pool_pad_left   = pool_info.pad_stride_info.pad_left();
        const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
        int pool_stride_x = 0;
        int pool_stride_y = 0;
        std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
        const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right);
        const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
        const unsigned char *const src_top_ptr    = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
        const unsigned char *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
        execute_window_loop(window, [&](const Coordinates &id)
        {
            float16x4_t top_data    = vld1_f16(reinterpret_cast<const float16_t *>(src_top_ptr + in.offset()));
            float16x4_t bottom_data = vld1_f16(reinterpret_cast<const float16_t *>(src_bottom_ptr + in.offset()));
            float16x4_t res         = {};
            // Get power of 2 in case of l2 pooling
            if(pool_info.pool_type == PoolingType::L2)
            {
                top_data    = vmul_f16(top_data, top_data);
                bottom_data = vmul_f16(bottom_data, bottom_data);
            }
            if(pool_info.pool_type != PoolingType::MAX)
            {
                // Calculate scale
                const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size,
                                                        upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
                const float16x4_t scale_v = vdup_n_f16(scale);

                // Perform pooling
                const float16x4_t sum_data = vadd_f16(top_data, bottom_data);
                res = vmul_f16(vpadd_f16(sum_data, sum_data), scale_v);
            }
            else
            {
                const float16x4_t max_data = vmax_f16(top_data, bottom_data);
                res = vpmax_f16(max_data, max_data);
            }
            // Calculate square-root in case of l2 pooling
            if(pool_info.pool_type == PoolingType::L2)
            {
                res = vinv_f16(vinvsqrt_f16(res));
            }
            // Store result
            *(reinterpret_cast<float16_t *>(out.ptr())) = vget_lane_f16(res, 0);
        },
        in, out);
    }
}
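// Aside on the L2 path above: the square root is evaluated as 1 / (1 / sqrt(x)). As I read them,
// vinvsqrt_f16 and vinv_f16 are the library's math helpers built on the NEON vrsqrte/vrecpe
// estimate instructions with Newton-Raphson refinement, so the f16 result is an approximation;
// the fp32 kernels below call std::sqrt on the scalar result instead.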
void poolingMxN_fp16_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
{
    ARM_COMPUTE_UNUSED(dst1);
    Iterator in(src, window_src);
    Iterator out(dst0, window);

    const int pool_size_x     = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width;
    const int pool_size_y     = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.height;
    const int pool_pad_right  = pool_info.pad_stride_info.pad_right();
    const int pool_pad_top    = pool_info.pad_stride_info.pad_top();
    const int pool_pad_left   = pool_info.pad_stride_info.pad_left();
    const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
    int pool_stride_x = 0;
    int pool_stride_y = 0;
    std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
    const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right);
    const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
    execute_window_loop(window, [&](const Coordinates &id)
    {
        float16_t   res  = 0.0f;
        float16x8_t vres = vdupq_n_f16(0.0f);
        if(pool_info.pool_type != PoolingType::MAX)
        {
            // Calculate scale
            const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y,
                                                    upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);

            // Perform pooling
            for(int y = 0; y < pool_size_y; ++y)
            {
                int x = 0;
                for(; x <= (pool_size_x - 8); x += 8)
                {
                    const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x())
                                                                                           + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y())));

                    // Get power of 2 in case of l2 pooling and accumulate
                    if(pool_info.pool_type == PoolingType::L2)
                    {
                        vres = vaddq_f16(vres, vmulq_f16(data, data));
                    }
                    else
                    {
                        vres = vaddq_f16(vres, data);
                    }
                }
                // Leftover for loop
                for(; x < pool_size_x; ++x)
                {
                    float16_t data = *(reinterpret_cast<const float16_t *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x())
                                                                           + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y())));

                    // Get power of 2 in case of l2 pooling
                    if(pool_info.pool_type == PoolingType::L2)
                    {
                        data *= data;
                    }

                    res += data;
                }
            }
            // Reduction
            float16x4_t tmp = vpadd_f16(vget_high_f16(vres), vget_low_f16(vres));
            res += vget_lane_f16(tmp, 0);
            res += vget_lane_f16(tmp, 1);
            res += vget_lane_f16(tmp, 2);
            res += vget_lane_f16(tmp, 3);

            // Divide by scale
            res *= scale;
        }
        else // if max pooling
        {
            vres = vdupq_n_f16(-std::numeric_limits<float>::max());
            res  = -std::numeric_limits<float>::max();
            for(int y = 0; y < pool_size_y; ++y)
            {
                int x = 0;
                for(; x <= (pool_size_x - 8); x += 8)
                {
                    const float16x8_t data = vld1q_f16(reinterpret_cast<const float16_t *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x())
                                                                                           + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y())));
                    vres = vmaxq_f16(vres, data);
                }
                // Leftover for loop
                for(; x < pool_size_x; ++x)
                {
                    const float16_t data = *(reinterpret_cast<const float16_t *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x())
                                                                                 + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y())));
                    res = std::max(res, data);
                }
            }
            float16x4_t tmp = vpmax_f16(vget_high_f16(vres), vget_low_f16(vres));
            res = std::max(res, vget_lane_f16(tmp, 0));
            res = std::max(res, vget_lane_f16(tmp, 1));
            res = std::max(res, vget_lane_f16(tmp, 2));
            res = std::max(res, vget_lane_f16(tmp, 3));
        }
        // Calculate square-root in case of l2 pooling
        if(pool_info.pool_type == PoolingType::L2)
        {
            res = std::sqrt(res);
        }
        // Store result
        *(reinterpret_cast<float16_t *>(out.ptr())) = res;
    },
    in, out);
}
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
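// The MxN kernels process the pooling window in two phases: a vectorised body that consumes eight
// f16 (or, below, four f32) columns per iteration, and a scalar "leftover" loop for the ragged
// tail. The vector accumulator and the scalar accumulator meet in one horizontal reduction at the
// end, so the kernel handles any pool width, not just multiples of the vector length.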
void poolingMxN_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
{
    ARM_COMPUTE_UNUSED(dst1);
    Iterator in(src, window_src);
    Iterator out(dst0, window);

    const int pool_size_x     = pool_info.is_global_pooling ? src->info()->tensor_shape().x() : pool_info.pool_size.width;
    const int pool_size_y     = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.height;
    const int pool_pad_right  = pool_info.pad_stride_info.pad_right();
    const int pool_pad_top    = pool_info.pad_stride_info.pad_top();
    const int pool_pad_left   = pool_info.pad_stride_info.pad_left();
    const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
    int pool_stride_x = 0;
    int pool_stride_y = 0;
    std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
    const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right);
    const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
    execute_window_loop(window, [&](const Coordinates &id)
    {
        float res = 0.0f;

        if(pool_info.pool_type != PoolingType::MAX)
        {
            // Calculate scale
            const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size_x, pool_size_y,
                                                    upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);

            // Perform pooling
            float32x4_t vres = vdupq_n_f32(0.0f);
            for(int y = 0; y < pool_size_y; ++y)
            {
                int x = 0;
                for(; x <= (pool_size_x - 4); x += 4)
                {
                    const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x())
                                                                                       + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y())));

                    // Get power of 2 in case of l2 pooling and accumulate
                    if(pool_info.pool_type == PoolingType::L2)
                    {
                        vres = vmlaq_f32(vres, data, data);
                    }
                    else
                    {
                        vres = vaddq_f32(vres, data);
                    }
                }
                // Leftover for loop
                for(; x < pool_size_x; ++x)
                {
                    float data = *(reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x())
                                                                   + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y())));

                    // Get power of 2 in case of l2 pooling
                    if(pool_info.pool_type == PoolingType::L2)
                    {
                        data *= data;
                    }

                    res += data;
                }
            }
#if defined(__aarch64__)
            // Reduction operation available on 64-bit architectures only
            res += vaddvq_f32(vres);
#else  // __aarch64__
            // Reduction
            float32x2_t tmp = vpadd_f32(vget_high_f32(vres), vget_low_f32(vres));
            tmp             = vpadd_f32(tmp, tmp);

            res += vget_lane_f32(tmp, 0);
#endif // __aarch64__

            // Divide by scale
            res *= scale;
        }
        else // if max pooling
        {
            float32x4_t vres = vdupq_n_f32(std::numeric_limits<float>::lowest());
            res              = std::numeric_limits<float>::lowest();

            for(int y = 0; y < pool_size_y; ++y)
            {
                int x = 0;
                for(; x <= (pool_size_x - 4); x += 4)
                {
                    const float32x4_t data = vld1q_f32(reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x())
                                                                                       + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y())));
                    vres = vmaxq_f32(vres, data);
                }
                // Leftover for loop
                for(; x < pool_size_x; ++x)
                {
                    const float data = *(reinterpret_cast<const float *>(in.ptr() + (x - pool_pad_left) * static_cast<int>(src->info()->strides_in_bytes().x())
                                                                         + (y - pool_pad_top) * static_cast<int>(src->info()->strides_in_bytes().y())));
                    res = std::max(res, data);
                }
            }
#if defined(__aarch64__)
            // Reduction operation available on 64-bit architectures only
            res = std::max(vmaxvq_f32(vres), res);
#else  // __aarch64__
            float32x2_t tmp = vpmax_f32(vget_high_f32(vres), vget_low_f32(vres));
            tmp             = vpmax_f32(tmp, tmp);

            res = std::max(res, vget_lane_f32(tmp, 0));
#endif // __aarch64__
        }

        // Calculate square-root in case of l2 pooling
        if(pool_info.pool_type == PoolingType::L2)
        {
            res = std::sqrt(res);
        }
        // Store result
        *(reinterpret_cast<float *>(out.ptr())) = res;
    },
    in, out);
}
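// Note the guarded reductions above: vaddvq_f32 and vmaxvq_f32 are across-vector reductions that
// exist only on AArch64, so the 32-bit Arm path folds the vector in halves with vpadd_f32/vpmax_f32
// instead. Both branches produce the same value; the AArch64 form just does it in one instruction.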
void pooling2_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
{
    if(pool_info.pool_type == PoolingType::MAX && dst1)
    {
        pooling2_nchw_maxpool_indices<float>(src, dst0, dst1, pool_info, window_src, window);
    }
    else
    {
        Iterator in(src, window_src);
        Iterator out(dst0, window);
        constexpr int pool_size   = 2;
        const int pool_pad_right  = pool_info.pad_stride_info.pad_right();
        const int pool_pad_top    = pool_info.pad_stride_info.pad_top();
        const int pool_pad_left   = pool_info.pad_stride_info.pad_left();
        const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
        int pool_stride_x = 0;
        int pool_stride_y = 0;
        std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
        const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right);
        const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
        const uint8_t *const src_top_ptr    = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
        const uint8_t *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
        execute_window_loop(window, [&](const Coordinates &id)
        {
            const auto  in_top_ptr    = reinterpret_cast<const float *>(src_top_ptr + in.offset());
            const auto  in_bottom_ptr = reinterpret_cast<const float *>(src_bottom_ptr + in.offset());
            float32x2_t top_data      = vld1_f32(in_top_ptr);
            float32x2_t bottom_data   = vld1_f32(in_bottom_ptr);
            float32x2_t res           = {};
            float       final_res     = 0.f;
            // Get power of 2 in case of l2 pooling
            if(pool_info.pool_type == PoolingType::L2)
            {
                top_data    = vmul_f32(top_data, top_data);
                bottom_data = vmul_f32(bottom_data, bottom_data);
            }
            if(pool_info.pool_type != PoolingType::MAX)
            {
                // Calculate scale
                const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size,
                                                        upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
                const float32x2_t scale_v = vdup_n_f32(scale);

                // Perform pooling
                const float32x2_t sum_data = vadd_f32(top_data, bottom_data);
                res = vmul_f32(vpadd_f32(sum_data, sum_data), scale_v);
            }
            else
            {
                const float32x2_t max_data = vmax_f32(top_data, bottom_data);
                res = vpmax_f32(max_data, max_data);
            }
            final_res = vget_lane_f32(res, 0);
            // Calculate square-root in case of l2 pooling
            if(pool_info.pool_type == PoolingType::L2)
            {
                final_res = sqrt(final_res);
            }
            // Store result
            *(reinterpret_cast<float *>(out.ptr())) = final_res;
        },
        in, out);
    }
}
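// A word on the traversal mechanics shared by all of these kernels: execute_window_loop walks the
// output Window, advancing the `in` and `out` Iterators in lockstep, and in.offset() yields the
// byte offset of the current input position. Because the row pointers were taken at
// (-pad_left, -pad_top), adding in.offset() lands on the top-left corner of the pooling window
// for the current output element, padding included.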
void pooling3_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
{
    ARM_COMPUTE_UNUSED(dst1);
    Iterator in(src, window_src);
    Iterator out(dst0, window);

    constexpr const int pool_size = 3;
    const int pool_pad_right  = pool_info.pad_stride_info.pad_right();
    const int pool_pad_top    = pool_info.pad_stride_info.pad_top();
    const int pool_pad_left   = pool_info.pad_stride_info.pad_left();
    const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
    int pool_stride_x = 0;
    int pool_stride_y = 0;
    std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
    const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right);
    const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
    const uint8_t *const src_top_ptr    = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top)));
    const uint8_t *const src_middle_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 1));
    const uint8_t *const src_bottom_ptr = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + 2));
    execute_window_loop(window, [&](const Coordinates &id)
    {
        float32x4_t top_data    = vld1q_f32(reinterpret_cast<const float *>(src_top_ptr + in.offset()));
        float32x4_t middle_data = vld1q_f32(reinterpret_cast<const float *>(src_middle_ptr + in.offset()));
        float32x4_t bottom_data = vld1q_f32(reinterpret_cast<const float *>(src_bottom_ptr + in.offset()));
        float32x2_t res         = {};
        float       final_res   = 0.f;
        // Get power of 2 in case of l2 pooling
        if(pool_info.pool_type == PoolingType::L2)
        {
            top_data    = vmulq_f32(top_data, top_data);
            middle_data = vmulq_f32(middle_data, middle_data);
            bottom_data = vmulq_f32(bottom_data, bottom_data);
        }
        if(pool_info.pool_type != PoolingType::MAX)
        {
            // Calculate scale
            const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size,
                                                    upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
            const float32x2_t scale_v = vdup_n_f32(scale);

            // Perform pooling
            const float32x4_t sum_data = vaddq_f32(vaddq_f32(top_data, bottom_data), middle_data);
            res = vpadd_f32(vget_high_f32(vsetq_lane_f32(0.f, sum_data, 3)), vget_low_f32(sum_data));
            res = vmul_f32(vpadd_f32(res, res), scale_v);
        }
        else
        {
            const float32x4_t max_data = vmaxq_f32(vmaxq_f32(top_data, bottom_data), middle_data);
            res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data, 3)), vget_low_f32(max_data));
            res = vpmax_f32(res, res);
        }
        final_res = vget_lane_f32(res, 0);
        // Calculate square-root in case of l2 pooling
        if(pool_info.pool_type == PoolingType::L2)
        {
            final_res = sqrt(final_res);
        }
        // Store result
        *(reinterpret_cast<float *>(out.ptr())) = final_res;
    },
    in, out);
}
void pooling7_fp32_neon_nchw(const ITensor *src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info, const Window &window_src, const Window &window)
{
    ARM_COMPUTE_UNUSED(dst1);
    Iterator in(src, window_src);
    Iterator out(dst0, window);

    constexpr const int pool_size = 7;
    const int pool_pad_right  = pool_info.pad_stride_info.pad_right();
    const int pool_pad_top    = pool_info.pad_stride_info.pad_top();
    const int pool_pad_left   = pool_info.pad_stride_info.pad_left();
    const int pool_pad_bottom = pool_info.pad_stride_info.pad_bottom();
    int pool_stride_x = 0;
    int pool_stride_y = 0;
    std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
    const int upper_bound_w = src->info()->dimension(0) + (pool_info.exclude_padding ? 0 : pool_pad_right);
    const int upper_bound_h = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
    std::array<const uint8_t *, pool_size> src_ptrs{ {} };
    for(int i = 0; i < pool_size; ++i)
    {
        src_ptrs[i] = src->ptr_to_element(Coordinates(-static_cast<int>(pool_pad_left), -static_cast<int>(pool_pad_top) + i));
    }
    execute_window_loop(window, [&](const Coordinates &id)
    {
        float32x2_t res       = {};
        float       final_res = 0.f;
        if(pool_info.pool_type != PoolingType::MAX)
        {
            // Calculate scale
            const float scale = calculate_avg_scale(pool_info.exclude_padding, DataLayout::NCHW, id, pool_size, pool_size,
                                                    upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x, pool_stride_y);
            const float32x2_t scale_v = vdup_n_f32(scale);
            // Perform pooling
            float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(src_ptrs[0] + in.offset()));

            // Get power of 2 in case of l2 pooling
            if(pool_info.pool_type == PoolingType::L2)
            {
                data.val[0] = vmulq_f32(data.val[0], data.val[0]);
                data.val[1] = vmulq_f32(data.val[1], data.val[1]);
            }
            float32x4_t sum_data = vaddq_f32(data.val[0], vsetq_lane_f32(0.f, data.val[1], 3));
            for(int i = 1; i < pool_size; ++i)
            {
                data = vld2q_f32(reinterpret_cast<const float *>(src_ptrs[i] + in.offset()));

                // Get power of 2 in case of l2 pooling
                if(pool_info.pool_type == PoolingType::L2)
                {
                    data.val[0] = vmulq_f32(data.val[0], data.val[0]);
                    data.val[1] = vmulq_f32(data.val[1], data.val[1]);
                }
                sum_data = vaddq_f32(sum_data, data.val[0]);
                sum_data = vaddq_f32(sum_data, vsetq_lane_f32(0.f, data.val[1], 3));
            }
            res = vpadd_f32(vget_high_f32(sum_data), vget_low_f32(sum_data));
            res = vmul_f32(vpadd_f32(res, res), scale_v);
        }
        else
        {
            float32x4x2_t max_data = vld2q_f32(reinterpret_cast<const float *>(src_ptrs[0] + in.offset()));
            for(int i = 1; i < pool_size; ++i)
            {
                const float32x4x2_t data = vld2q_f32(reinterpret_cast<const float *>(src_ptrs[i] + in.offset()));
                max_data                 = vmax2q_f32(max_data, data);
            }
            res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits<float>::max(), max_data.val[1], 3)), vget_low_f32(max_data.val[1]));
            res = vpmax_f32(res, vpmax_f32(vget_high_f32(max_data.val[0]), vget_low_f32(max_data.val[0])));
            res = vpmax_f32(res, res);
        }
        final_res = vget_lane_f32(res, 0);
        // Calculate square-root in case of l2 pooling
        if(pool_info.pool_type == PoolingType::L2)
        {
            final_res = sqrt(final_res);
        }
        // Store result
        *(reinterpret_cast<float *>(out.ptr())) = final_res;
    },
    in, out);
}
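// The 7x7 kernel's trick (my annotation): vld2q_f32 loads eight consecutive floats and
// de-interleaves them, giving val[0] = {e0, e2, e4, e6} and val[1] = {e1, e3, e5, e7}. The window
// is only seven wide, so e7 (lane 3 of val[1]) is masked with the neutral element (0 for sum,
// -FLT_MAX for max) via vsetq_lane_f32 before the rows are accumulated and reduced.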
#endif // ENABLE_NCHW_KERNELS