// 2x2 float max pooling that also records, for every output element, the index
// of the input element that produced the maximum.
//
// NOTE(review): this excerpt is fragmentary. The embedded original line numbers
// skip, so braces, the enclosing per-position loop that supplies `id`, and the
// declarations of `idx_height`, `vres` and `res` are not visible here.
//
// src        : input tensor (read through `in`, iterated over window_src).
// dst0       : receives the max values (written as float).
// dst1       : receives the winning indices (written as uint32_t).
// pool_info  : source of the pooling padding (top/left) and the strides.
// window_src : window used to iterate the input.
// window     : output window; its X range is walked manually below.
37 void pooling2_f32_maxpool_indices(
const ITensor *
src, ITensor *dst0, ITensor *dst1, PoolingLayerInfo &pool_info,
const Window &window_src,
const Window &window)
// X range of the execution window. X is processed by the explicit loops below,
// four floats per vector iteration.
39 const int window_start_x = window.x().start();
40 const int window_end_x = window.x().end();
41 const int window_step_x = 4;
// Collapse X in the output window to a single step because X is handled by hand.
43 Window window_out = window;
44 window_out.set(
Window::DimX, Window::Dimension(0, 1, 1));
// Iterators: input over window_src, both outputs over the collapsed window.
46 Iterator in(
src, window_src);
47 Iterator out(dst0, window_out);
48 Iterator indices(dst1, window_out);
// Pooling padding applied above / to the left of the input.
50 const int pool_pad_top = pool_info.pad_stride_info.pad_top();
51 const int pool_pad_left = pool_info.pad_stride_info.pad_left();
// Pooling strides, unpacked from the (x, y) pair returned by stride().
53 int pool_stride_x = 0;
54 int pool_stride_y = 0;
55 std::tie(pool_stride_x, pool_stride_y) = pool_info.pad_stride_info.stride();
// Left/right padding of the source tensor itself; subtracted further down when
// turning strided offsets into indices (presumably so that indices refer to an
// unpadded buffer - TODO confirm against offset_no_padding).
60 const int pad_right =
src->info()->padding().right;
61 const int pad_left =
src->info()->padding().left;
62 const int pad_horizontal = pad_right + pad_left;
// Byte strides of the source along the y and z tensor dimensions.
63 const int in_stride_y =
static_cast<int>(
src->info()->strides_in_bytes().y());
64 const int in_stride_z =
static_cast<int>(
src->info()->strides_in_bytes().z());
// Position of the current pooling window, derived from the output coordinate.
// NOTE(review): `id` and `idx_height` are defined on lines not shown here.
68 const int idx_width =
id.y() * pool_stride_x;
70 const int pool_limit_y = pool_pad_top -
idx_height;
71 const int pool_limit_x = pool_pad_left -
idx_width;
// First window row/column to read, clamped so it is never negative.
73 const int pool_start_y = std::max(0, window_src.z().start() + pool_limit_y);
74 const int pool_start_x = std::max(0, window_src.y().start() + pool_limit_x);
// Byte offsets (relative to in.ptr()) of the four elements of the 2x2 window:
//   x0 = (start_x,     start_y)      x1 = (start_x + 1, start_y)
//   x2 = (start_x,     start_y + 1)  x3 = (start_x + 1, start_y + 1)
// The "x" coordinate uses the y byte stride and the "y" coordinate uses the z
// byte stride of the tensor.
76 const int in_x0_offset = (pool_start_x - pool_pad_left) *
static_cast<int>(
src->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) *
static_cast<int>(
src->info()->strides_in_bytes().z());
77 const int in_x1_offset = (pool_start_x + 1 - pool_pad_left) *
static_cast<int>(
src->info()->strides_in_bytes().y()) + (pool_start_y - pool_pad_top) *
static_cast<int>
78 (
src->info()->strides_in_bytes().z());
79 const int in_x2_offset = (pool_start_x - pool_pad_left) *
static_cast<int>(
src->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) *
static_cast<int>
80 (
src->info()->strides_in_bytes().z());
81 const int in_x3_offset = (pool_start_x + 1 - pool_pad_left) *
static_cast<int>(
src->info()->strides_in_bytes().y()) + (pool_start_y + 1 - pool_pad_top) *
static_cast<int>
82 (
src->info()->strides_in_bytes().z());
// Vector loop: handles four consecutive floats along X per iteration.
84 int x_off = window_start_x;
85 for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
// Load the four window elements for these four X positions.
87 const auto in_x0_ptr =
reinterpret_cast<const float *
>(in.ptr() + in_x0_offset);
88 const auto in_x1_ptr =
reinterpret_cast<const float *
>(in.ptr() + in_x1_offset);
89 const auto in_x2_ptr =
reinterpret_cast<const float *
>(in.ptr() + in_x2_offset);
90 const auto in_x3_ptr =
reinterpret_cast<const float *
>(in.ptr() + in_x3_offset);
91 const auto v_x0 = vld1q_f32(in_x0_ptr + x_off);
92 const auto v_x1 = vld1q_f32(in_x1_ptr + x_off);
93 const auto v_x2 = vld1q_f32(in_x2_ptr + x_off);
94 const auto v_x3 = vld1q_f32(in_x3_ptr + x_off);
// Lane-wise maximum over the 2x2 window.
95 vres = vmaxq_f32(vmaxq_f32(v_x2, v_x3), vmaxq_f32(v_x0, v_x1));
// Store the max values.
97 vst1q_f32(
reinterpret_cast<float *
>(out.ptr()) + x_off, vres);
// Index of each window element. offset_no_padding is a helper defined elsewhere;
// its result is divided by sizeof(float) (presumably bytes -> elements) and then
// advanced by one y-stride / z-stride with the tensor's horizontal padding
// subtracted (once for the y step, once per tensor_shape()[1] for the z step).
99 const uint32_t offset_base = offset_no_padding<float>(in.offset(),
id, *
src->info(), pool_stride_x, pool_stride_y,
DataLayout::NHWC);
100 const uint32_t offset_x0 = offset_base /
sizeof(float) + x_off;
101 const uint32_t offset_x1 = offset_x0 + in_stride_y /
sizeof(float) - pad_horizontal;
102 const uint32_t offset_x2 = offset_x0 + in_stride_z /
sizeof(float) - pad_horizontal *
src->info()->tensor_shape()[1];
103 const uint32_t offset_x3 = offset_x2 + in_stride_y /
sizeof(float) - pad_horizontal;
// Four consecutive indices per window element, one per vector lane.
104 const uint32x4_t voffset_x0 = { offset_x0, offset_x0 + 1, offset_x0 + 2, offset_x0 + 3 };
105 const uint32x4_t voffset_x1 = { offset_x1, offset_x1 + 1, offset_x1 + 2, offset_x1 + 3 };
106 const uint32x4_t voffset_x2 = { offset_x2, offset_x2 + 1, offset_x2 + 2, offset_x2 + 3 };
107 const uint32x4_t voffset_x3 = { offset_x3, offset_x3 + 1, offset_x3 + 2, offset_x3 + 3 };
// Pick the index of the larger element. The comparisons are >=, so on a tie the
// first operand wins: x0 over x1, x2 over x3, and the (x0, x1) pair over (x2, x3).
108 const uint32x4_t tmp_indices0 = vbslq_u32(vcgeq_f32(v_x0, v_x1), voffset_x0, voffset_x1);
109 const uint32x4_t tmp_indices1 = vbslq_u32(vcgeq_f32(v_x2, v_x3), voffset_x2, voffset_x3);
110 const uint32x4_t tmp_indices2 = vbslq_u32(vcgeq_f32(vmaxq_f32(v_x0, v_x1), vmaxq_f32(v_x2, v_x3)), tmp_indices0, tmp_indices1);
// Store the winning indices.
113 vst1q_u32(
reinterpret_cast<uint32_t *
>(indices.ptr()) + x_off, tmp_indices2);
// Scalar tail: same computation, one float at a time, for the X positions that
// did not fill a whole vector.
117 for(; x_off < window_end_x; ++x_off)
119 const auto x0 = *(
reinterpret_cast<const float *
>(in.ptr() + in_x0_offset) + x_off);
120 const auto x1 = *(
reinterpret_cast<const float *
>(in.ptr() + in_x1_offset) + x_off);
121 const auto x2 = *(
reinterpret_cast<const float *
>(in.ptr() + in_x2_offset) + x_off);
122 const auto x3 = *(
reinterpret_cast<const float *
>(in.ptr() + in_x3_offset) + x_off);
// Maximum over the 2x2 window.
123 res = std::max(std::max(x2, x3), std::max(x0, x1));
// Store the max value.
126 *(
reinterpret_cast<float *
>(out.ptr()) + x_off) = res;
// Index computation mirrors the vector path above.
128 const uint32_t offset_base = offset_no_padding<float>(in.offset(),
id, *
src->info(), pool_stride_x, pool_stride_y,
DataLayout::NHWC);
129 const uint32_t offset_x0 = offset_base /
sizeof(float) + x_off;
130 const uint32_t offset_x1 = offset_x0 + in_stride_y /
sizeof(float) - pad_horizontal;
131 const uint32_t offset_x2 = offset_x0 + in_stride_z /
sizeof(float) - pad_horizontal *
src->info()->tensor_shape()[1];
132 const uint32_t offset_x3 = offset_x2 + in_stride_y /
sizeof(float) - pad_horizontal;
// Same tie-breaking as the vector path: >= keeps the first operand on equality.
133 const uint32_t tmp_idx0 = (x0 >= x1) ? offset_x0 : offset_x1;
134 const uint32_t tmp_idx1 = (x2 >= x3) ? offset_x2 : offset_x3;
135 const uint32_t tmp_idx2 = (std::max(x0, x1) >= std::max(x2, x3)) ? tmp_idx0 : tmp_idx1;
// Store the winning index.
138 *(
reinterpret_cast<uint32_t *
>(indices.ptr()) + x_off) = tmp_idx2;
// NOTE(review): fragment of a max-pooling routine for a general
// pool_size_x by pool_size_y window that outputs both the max value and an index.
// The function signature, braces, the per-position loop that supplies `id`, and
// the declarations of pool_size_x/y, pool_pad_left/top, input_dim_h, idx_width
// (the dimension index used below), min_value, vres, vidx, idx, `out` and
// `indices` are on lines not visible in this excerpt.
//
// X range of the execution window; X is walked manually, four floats per vector.
147 const int window_start_x = window.
x().
start();
148 const int window_end_x = window.
x().
end();
149 constexpr
int window_step_x = 4;
151 Window window_out = window;
// Pooling strides (their assignment is not visible here).
163 int pool_stride_x = 0;
164 int pool_stride_y = 0;
// Tensor dimension index used for the batch stride and batch coordinate.
174 constexpr
int idx_batch = 3;
// Byte strides of the source along y, z and the batch dimension.
176 const int y_stride =
static_cast<int>(
src->info()->strides_in_bytes().y());
177 const int z_stride =
static_cast<int>(
src->info()->strides_in_bytes().z());
178 const int n_stride =
static_cast<int>(
src->info()->strides_in_bytes()[idx_batch]);
// Input extent along the width dimension (here `idx_width` is a dimension index
// declared on a line not shown; a different `idx_width` is declared below).
180 const int input_dim_w =
src->info()->dimension(
idx_width);
// Address of the first element of the source buffer.
183 const uint8_t *in_ptr_start =
src->buffer() +
src->info()->offset_first_element_in_bytes();
// Input coordinate of the top-left corner of the pooling window; it is negative
// when the window starts inside the pooling padding.
187 const int idx_width =
static_cast<int>(
id.y()) * pool_stride_x - pool_pad_left;
188 const int idx_height =
static_cast<int>(
id.z()) * pool_stride_y - pool_pad_top;
// First window column/row that lies inside the input (skips the padded part).
190 const int pool_start_x = std::max(0, -
idx_width);
191 const int pool_start_y = std::max(0, -
idx_height);
// One-past-last window column/row, clipped to the input extent.
193 const int pool_end_x = std::min(pool_size_x, input_dim_w -
idx_width);
194 const int pool_end_y = std::min(pool_size_y, input_dim_h -
idx_height);
// Base pointer of the current batch.
196 const uint8_t *in_ptr_n = in_ptr_start +
id[idx_batch] * n_stride;
// Byte offsets from the batch base to the first valid row / column of the window.
198 const int in_ptr_y_offset = (z_stride *
idx_height) + (pool_start_y * z_stride);
199 const int in_ptr_x_offset = (y_stride *
idx_width) + (pool_start_x * y_stride);
// Vector loop over X, four floats per iteration.
201 int x_off = window_start_x;
203 for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
// Start from min_value with index 0 in every lane.
205 vres = vdupq_n_f32(min_value);
206 vidx = vdupq_n_u32(0
U);
207 const uint8_t *in_ptr_y = in_ptr_n + in_ptr_y_offset + in_ptr_x_offset;
// Position within the pooling window, numbered row by row with pool_size_x
// entries per row (the scalar path below uses pool_size_x * y + x). Start at the
// first valid row.
208 uint32_t curr_kernel_index = pool_size_x * pool_start_y;
209 for(
int y = pool_start_y; y < pool_end_y; ++y)
211 const uint8_t *in_ptr_x = in_ptr_y + (x_off *
sizeof(float));
// Skip the window columns clipped on the left of this row.
212 curr_kernel_index += pool_start_x;
213 for(
int x = pool_start_x; x < pool_end_x; ++x)
215 const float32x4_t data = vld1q_f32(
reinterpret_cast<const float *
>(in_ptr_x));
216 const uint32x4_t vidx_curr = vdupq_n_u32(curr_kernel_index);
// Strict greater-than: a lane's index is replaced only when the new value
// exceeds the running max, so the earliest maximum keeps its index.
217 const uint32x4_t idxMask = vcgtq_f32(data, vres);
218 vidx = vbslq_u32(idxMask, vidx_curr, vidx);
219 vres = vmaxq_f32(vres, data);
// Advance to the next window column.
// NOTE(review): no per-element increment of curr_kernel_index is visible in this
// excerpt; the original numbering has a gap after this line - confirm it exists.
220 in_ptr_x += y_stride;
// Skip the window columns clipped on the right, then move to the next row.
223 curr_kernel_index += (pool_size_x - pool_end_x);
224 in_ptr_y += z_stride;
// Store max values and their window-relative indices.
227 vst1q_f32(
reinterpret_cast<float *
>(out.
ptr()) + x_off, vres);
228 vst1q_u32(
reinterpret_cast<uint32_t *
>(indices.
ptr()) + x_off, vidx);
// Scalar tail for the X positions that did not fill a whole vector.
232 for(; x_off < window_end_x; ++x_off)
234 float res = min_value;
236 const uint8_t *in_ptr_y = in_ptr_n + in_ptr_y_offset + in_ptr_x_offset;
237 for(
int y = pool_start_y; y < pool_end_y; ++y)
239 const uint8_t *in_ptr_x = in_ptr_y + (x_off *
sizeof(float));
240 for(
int x = pool_start_x; x < pool_end_x; ++x)
242 const float data = *(
reinterpret_cast<const float *
>(in_ptr_x));
// Row-major position inside the pooling window.
// NOTE(review): the comparison guarding this assignment and the update of `res`
// are on lines not visible here.
245 idx = pool_size_x * y + x;
248 in_ptr_x += y_stride;
250 in_ptr_y += z_stride;
// Store the max value and its window-relative index.
254 *(
reinterpret_cast<float *
>(out.
ptr()) + x_off) = res;
255 *(
reinterpret_cast<uint32_t *
>(indices.
ptr()) + x_off) = idx;
// NOTE(review): fragment of a generic pooling routine. Its signature, braces,
// the conditions selecting between the paths below, the per-position loop that
// supplies `id`, and the declarations of pool_size_x/y, the pool_pad_* values,
// min_value, vres, res, `in` and `out` are on lines not visible in this excerpt.
//
// Hands the work over to the 2x2 max-pool-with-indices routine (the condition
// under which this happens is not visible here).
269 pooling2_f32_maxpool_indices(
src, dst0, dst1, pool_info, window_src, window);
// X range of the execution window; X is walked manually, four floats per vector.
273 const int window_start_x = window.
x().
start();
274 const int window_end_x = window.
x().
end();
275 const int window_step_x = 4;
277 Window window_out = window;
// Pooling strides (their assignment is not visible here).
289 int pool_stride_x = 0;
290 int pool_stride_y = 0;
// Upper bounds passed to the scale computation: the input extent along
// dimensions 1 and 2, extended by the right/bottom pooling padding unless
// exclude_padding is set.
292 const int upper_bound_w =
src->info()->dimension(1) + (pool_info.
exclude_padding ? 0 : pool_pad_right);
293 const int upper_bound_h =
src->info()->dimension(2) + (pool_info.
exclude_padding ? 0 : pool_pad_bottom);
// Position of the current pooling window, derived from the output coordinate.
299 const int idx_width =
id.y() * pool_stride_x;
300 const int idx_height =
id.z() * pool_stride_y;
301 const int pool_limit_y = pool_pad_top -
idx_height;
302 const int pool_limit_x = pool_pad_left -
idx_width;
// Window row/column range to visit, clamped to [0, pool_size_y] / [0, pool_size_x].
304 const int pool_start_y = std::max(0, window_src.
z().
start() + pool_limit_y);
305 const int pool_end_y = std::min(pool_size_y, window_src.
z().
end() + pool_limit_y);
306 const int pool_start_x = std::max(0, window_src.
y().
start() + pool_limit_x);
307 const int pool_end_x = std::min(pool_size_x, window_src.
y().
end() + pool_limit_x);
// Vector loop over X, four floats per iteration.
309 int x_off = window_start_x;
310 for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x)
// Scale factor from calculate_avg_scale_pool2d (helper defined elsewhere).
// NOTE(review): the end of this call's argument list is on a line not shown.
315 const float scale = calculate_avg_scale_pool2d(pool_info.
exclude_padding,
DataLayout::NHWC,
id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
317 const float32x4_t scale_v = vdupq_n_f32(
scale);
// Accumulating path: start from zero and add every element of the window.
320 vres = vdupq_n_f32(0.0f);
322 for(
int y = pool_start_y; y < pool_end_y; ++y)
324 for(
int x = pool_start_x; x < pool_end_x; ++x)
// Element (x, y) of the window: x uses the y byte stride, y uses the z byte
// stride, both shifted back by the pooling padding.
326 const float32x4_t data = vld1q_f32(
reinterpret_cast<const float *
>(in.
ptr() + (x - pool_pad_left) *
static_cast<int>(
src->info()->strides_in_bytes().y()) + (y - pool_pad_top) *
static_cast<int>
327 (
src->info()->strides_in_bytes().z())) + x_off);
// Two alternative accumulations (the selecting condition is not visible):
// sum of squares (vres += data * data) ...
332 vres = vmlaq_f32(vres, data, data);
// ... or plain sum.
336 vres = vaddq_f32(vres, data);
// Apply the scale factor to the accumulated value.
341 vres = vmulq_f32(vres, scale_v);
// Max path: start from min_value and keep the lane-wise maximum over the window.
345 vres = vdupq_n_f32(min_value);
346 for(
int y = pool_start_y; y < pool_end_y; ++y)
348 for(
int x = pool_start_x; x < pool_end_x; ++x)
350 const float32x4_t data = vld1q_f32(
reinterpret_cast<const float *
>(in.
ptr() + (x - pool_pad_left) *
static_cast<int>(
src->info()->strides_in_bytes().y()) + (y - pool_pad_top) *
static_cast<int>
351 (
src->info()->strides_in_bytes().z())) + x_off);
352 vres = vmaxq_f32(vres, data);
// Lane-by-lane square root of vres.
// NOTE(review): the condition for this step, the closing of the initializer and
// the use of l2_res are on lines not visible here.
360 float32x4_t l2_res = {
static_cast<float>(sqrt(vgetq_lane_f32(vres, 0))),
361 static_cast<float>(sqrt(vgetq_lane_f32(vres, 1))),
362 static_cast<float>(sqrt(vgetq_lane_f32(vres, 2))),
363 static_cast<float>(sqrt(vgetq_lane_f32(vres, 3)))
// Store the four results.
369 vst1q_f32(
reinterpret_cast<float *
>(out.
ptr()) + x_off, vres);
// Scalar tail for the X positions that did not fill a whole vector; it mirrors
// the vector paths above one float at a time.
373 for(; x_off < window_end_x; ++x_off)
// NOTE(review): the end of this call's argument list is on a line not shown.
380 const float scale = calculate_avg_scale_pool2d(pool_info.
exclude_padding,
DataLayout::NHWC,
id, pool_size_x, pool_size_y, upper_bound_w, upper_bound_h, pool_pad_left, pool_pad_top, pool_stride_x,
// Accumulating path (the accumulation statements themselves are not visible).
383 for(
int y = pool_start_y; y < pool_end_y; ++y)
385 for(
int x = pool_start_x; x < pool_end_x; ++x)
387 const float data = *(
reinterpret_cast<const float *
>(in.
ptr() + (x - pool_pad_left) *
static_cast<int>(
src->info()->strides_in_bytes().y()) + (y - pool_pad_top) *
static_cast<int>
388 (
src->info()->strides_in_bytes().z())) + x_off);
// Max path: keep the largest element of the window.
408 for(
int y = pool_start_y; y < pool_end_y; ++y)
410 for(
int x = pool_start_x; x < pool_end_x; ++x)
412 const float data = *(
reinterpret_cast<const float *
>(in.
ptr() + (x - pool_pad_left) *
static_cast<int>(
src->info()->strides_in_bytes().y()) + (y - pool_pad_top) *
static_cast<int>
413 (
src->info()->strides_in_bytes().z())) + x_off);
414 res = std::max(res, data);
// Square root of the result (the condition for this step is not visible).
422 res = std::sqrt(res);
// Store the result.
426 *(
reinterpret_cast<float *
>(out.
ptr()) + x_off) = res;