// Bilinear scale routine operating on int8_t tensor data that carries uniform
// quantization info (see iq_info / oq_info below).
// NOTE(review): this excerpt is line-sampled. Every statement is prefixed with its
// original line number, and the lines in between (remaining parameters, braces,
// the definitions of dst, dx, dy, scale_y, window and win_off, and any enclosing
// branch condition) are not visible here. Comments only describe visible lines.
31 void qasymm8_signed_neon_scale_bilinear(
const ITensor *
src,
33 const ITensor *offsets,
37 PixelValue constant_border_value,
38 float sampling_offset,
// Uniform quantization parameters of the source and of the destination tensor.
43 const UniformQuantizationInfo iq_info =
src->info()->quantization_info().uniform();
44 const UniformQuantizationInfo oq_info =
dst->info()->quantization_info().uniform();
// Source extent along dimensions 1 and 2; used as the width/height bounds below.
46 const int32_t input_width =
src->info()->dimension(1);
47 const int32_t input_height =
src->info()->dimension(2);
// Byte strides of the source along dimensions 1 and 2. They are multiplied by
// index_w and index_h respectively when addressing a neighbour.
57 const int32_t in_stride_y =
src->info()->strides_in_bytes()[1];
58 const int32_t in_stride_z =
src->info()->strides_in_bytes()[2];
// Copy of the execution window with dimensions 1 and 2 collapsed to (0, 0, 0).
// The input iterator built from it is only used as a base pointer (in.ptr());
// the position inside the plane is added by hand from index_w / index_h.
66 Window win_in(window);
67 win_in.set(1, Window::Dimension(0, 0, 0));
68 win_in.set(2, Window::Dimension(0, 0, 0));
// Collapse every dimension of win_off from DimZ up to the rank of `offsets`.
// win_off itself is declared on a line that is not part of this excerpt.
70 for (
size_t d =
Window::DimZ; d < offsets->info()->num_dimensions(); ++d)
72 win_off.set(d, Window::Dimension(0, 0, 0));
// Input iterates over the collapsed window, output over the full window.
75 Iterator in(
src, win_in);
76 Iterator out(
dst, window);
// Border value extracted as int8_t; used as fallback for out-of-range samples.
78 const int8_t const_border_value =
static_cast<int8_t
>(constant_border_value.get<int8_t>());
// Per-element lambda receiving the current coordinates.
// NOTE(review): the call it is passed to is elided — presumably a window loop; confirm.
81 [&](
const Coordinates &
id)
// Source row index: floor((y_out + sampling_offset) * scale_y - sampling_offset),
// with y_out taken from id[2].
83 const int32_t index_h = std::floor((
id[2] + sampling_offset) *
scale_y - sampling_offset);
// Source column index, read as int32_t from `offsets` at (id[1], id[2]).
84 const int32_t index_w =
85 *(
reinterpret_cast<const int32_t *
>(offsets->ptr_to_element(Coordinates(
id[1],
id[2]))));
// Float values read from `dx` and `dy` at the same (id[1], id[2]) position.
// NOTE(review): presumably the fractional interpolation weights; their use is
// on lines not shown here — confirm.
86 const auto dx_val = *(
reinterpret_cast<const float *
>(dx->ptr_to_element(Coordinates(
id[1],
id[2]))));
87 const auto dy_val = *(
reinterpret_cast<const float *
>(dy->ptr_to_element(Coordinates(
id[1],
id[2]))));
// Base pointer of the input iterator, viewed as int8_t.
88 const auto pixel_row_ptr =
reinterpret_cast<const int8_t *
>(in.ptr());
// The four neighbours (index_w, index_h), (index_w + 1, index_h),
// (index_w, index_h + 1) and (index_w + 1, index_h + 1). Each is dereferenced
// only when its own column lies in [0, input_width) and its own row lies in
// [0, input_height). The visible fallback (last line) is const_border_value;
// the else-branches of the first three ternaries are elided from this excerpt,
// as is the declaration that the last ternary initialises.
90 const auto a00 = (0 <= index_w && index_w < input_width && 0 <= index_h && index_h < input_height)
91 ? (*(pixel_row_ptr + index_w * in_stride_y + index_h * in_stride_z))
93 const auto a01 = (-1 <= index_w && index_w + 1 < input_width && 0 <= index_h && index_h < input_height)
94 ? (*(pixel_row_ptr + (index_w + 1) * in_stride_y + index_h * in_stride_z))
96 const auto a10 = (0 <= index_w && index_w < input_width && -1 <= index_h && index_h < input_height - 1)
97 ? (*(pixel_row_ptr + index_w * in_stride_y + (index_h + 1) * in_stride_z))
100 (-1 <= index_w && index_w < input_width - 1 && -1 <= index_h && index_h < input_height - 1)
101 ? (*(pixel_row_ptr + (index_w + 1) * in_stride_y + (index_h + 1) * in_stride_z))
102 : const_border_value;
// Second sampling path of the bilinear routine: neighbour coordinates are clamped
// into the valid range instead of being bounds-checked, and channels are processed
// 16 at a time with a scalar tail loop.
// NOTE(review): this excerpt is line-sampled (original line numbers are embedded,
// braces and many statements are missing, including original lines 216-304 that
// hold the vector interpolation). The condition selecting this path is not visible.
// Tag types used to select the 128-bit float / int32 overloads of wrapper::vdup_n.
115 using FloatTagType =
typename wrapper::traits::neon_bitvector_tag_t<float, wrapper::traits::BitWidth::W128>;
116 using Int32TagType =
typename wrapper::traits::neon_bitvector_tag_t<int32_t, wrapper::traits::BitWidth::W128>;
// Byte strides of source and destination along dimensions 1, 2 and 3. Note the
// naming: "x" is dimension 1, "y" is dimension 2, "b" is dimension 3.
118 const int in_stride_x =
src->info()->strides_in_bytes()[1];
119 const int in_stride_y =
src->info()->strides_in_bytes()[2];
120 const int in_stride_b =
src->info()->strides_in_bytes()[3];
121 const int out_stride_x =
dst->info()->strides_in_bytes()[1];
122 const int out_stride_y =
dst->info()->strides_in_bytes()[2];
123 const int out_stride_b =
dst->info()->strides_in_bytes()[3];
// Extent of destination dimension 0 (iterated by `cout` below) and the number
// of channels consumed per vector iteration.
124 const int out_dim_ch =
dst->info()->dimension(0);
125 constexpr
int step_cout = 16;
// window_execution supplies the loop bounds for dimensions 1..3; its DimX is
// reduced to a single step because dimension 0 is walked manually via `cout`.
127 Window window_execution = window;
128 window_execution.set(
Window::DimX, Window::Dimension(0, 1, 1));
// Iterator window with DimY and DimZ collapsed: in.ptr()/out.ptr() act as base
// pointers and the x/y/batch offsets are added explicitly from the strides.
129 Window win_in_out(window);
130 win_in_out.set(
Window::DimY, Window::Dimension(0, 0, 0));
131 win_in_out.set(
Window::DimZ, Window::Dimension(0, 0, 0));
132 Iterator in(
src, win_in_out);
133 Iterator out(
dst, win_in_out);
// Start / end / step of the output x (dim 1), y (dim 2) and batch (dim 3) loops.
135 const int xo_start = window_execution[1].start();
136 const int xo_end = window_execution[1].end();
137 const int xo_step = window_execution[1].step();
138 const int yo_start = window_execution[2].start();
139 const int yo_end = window_execution[2].end();
140 const int yo_step = window_execution[2].step();
141 const int bo_start = window_execution[3].start();
142 const int bo_end = window_execution[3].end();
143 const int bo_step = window_execution[3].step();
// Pre-folded sampling offset: o * scale + sampling_offset * (scale - 1) is
// algebraically (o + sampling_offset) * scale - sampling_offset.
145 const float fp_coord_offset_y = sampling_offset * (
scale_y - 1);
146 const float fp_coord_offset_x = sampling_offset * (
scale_x - 1);
// Uniform quantization parameters of the source and of the destination tensor.
148 const UniformQuantizationInfo iq_info =
src->info()->quantization_info().uniform();
149 const UniformQuantizationInfo oq_info =
dst->info()->quantization_info().uniform();
// Input scale / offset broadcast to all lanes (their use is on elided lines).
151 const float32x4_t vscale_in =
wrapper::vdup_n(iq_info.scale, FloatTagType{});
152 const int32x4_t voffset_in =
wrapper::vdup_n(iq_info.offset, Int32TagType{});
// Reciprocal of the output scale and the output offset (as float), broadcast.
154 const float32x4_t invvscale_o =
wrapper::vdup_n(1.f / oq_info.scale, FloatTagType{});
155 const float32x4_t voffset_o = vdupq_n_f32(oq_info.offset);
// Outermost loop over dimension 3; selects the per-batch base pointers.
157 for (
int bo = bo_start; bo < bo_end; bo += bo_step)
159 const int8_t *in_ptr =
reinterpret_cast<int8_t *
>(in.ptr() + bo * in_stride_b);
160 int8_t *out_ptr =
reinterpret_cast<int8_t *
>(out.ptr() + bo * out_stride_b);
// Loop over output rows.
162 for (
int yo = yo_start; yo < yo_end; yo += yo_step)
// Fractional source row for this output row.
165 const float yi_f = yo *
scale_y + fp_coord_offset_y;
// Integer part (floor) of the source row.
167 const int yi =
static_cast<int>(std::floor(yi_f));
// a1 = distance from row yi (weight of row yi + 1), b1 = weight of row yi.
169 const float a1 = (yi_f -
static_cast<float>(yi));
170 const float b1 = (1.f - a1);
// Both contributing rows are clamped into [0, input_height - 1], so rows
// outside the source reuse the nearest edge row.
172 const int yi0 = utility::clamp<int>(yi, 0, input_height - 1);
173 const int yi1 = utility::clamp<int>(yi + 1, 0, input_height - 1);
// Row base pointers for the two source rows and for the output row.
175 const int8_t *in_ptr_yi0 = in_ptr + yi0 * in_stride_y;
176 const int8_t *in_ptr_yi1 = in_ptr + yi1 * in_stride_y;
178 int8_t *out_ptr_yo = out_ptr + yo * out_stride_y;
// Loop over output columns; mirrors the row computation above.
179 for (
int xo = xo_start; xo < xo_end; xo += xo_step)
182 const float xi_f = xo *
scale_x + fp_coord_offset_x;
184 const int xi =
static_cast<int>(std::floor(xi_f));
// a = weight of column xi + 1, b = weight of column xi.
186 const float a = (xi_f -
static_cast<float>(xi));
187 const float b = (1.f - a);
// Bilinear weights of the four neighbours; the four values sum to 1.
// s00 -> (xi0, yi0), s01 -> (xi1, yi0), s10 -> (xi0, yi1), s11 -> (xi1, yi1).
189 const float s00_s =
b * b1;
190 const float s01_s = a * b1;
191 const float s10_s =
b * a1;
192 const float s11_s = a * a1;
// Contributing columns, clamped into [0, input_width - 1].
199 const int xi0 = utility::clamp<int>(xi, 0, input_width - 1);
200 const int xi1 = utility::clamp<int>(xi + 1, 0, input_width - 1);
// Pointers to the first channel of each of the four neighbours.
202 const auto in_ptr_xi0_yi0 = in_ptr_yi0 + xi0 * in_stride_x;
203 const auto in_ptr_xi1_yi0 = in_ptr_yi0 + xi1 * in_stride_x;
204 const auto in_ptr_xi0_yi1 = in_ptr_yi1 + xi0 * in_stride_x;
205 const auto in_ptr_xi1_yi1 = in_ptr_yi1 + xi1 * in_stride_x;
// Pointer to the first channel of the output element.
207 int8_t *out_ptr_xo_yo = out_ptr_yo + xo * out_stride_x;
// Vector loop: runs while a full group of step_cout channels remains.
// `cout` is declared on a line that is not part of this excerpt.
210 for (; cout <= (out_dim_ch - step_cout); cout += step_cout)
// Load one vector from each neighbour at channel offset cout.
// NOTE(review): presumably 16 int8 lanes each, matching step_cout — confirm
// against wrapper::vloadq.
212 const auto in00 =
wrapper::vloadq(in_ptr_xi0_yi0 + cout *
sizeof(int8_t));
213 const auto in01 =
wrapper::vloadq(in_ptr_xi1_yi0 + cout *
sizeof(int8_t));
214 const auto in10 =
wrapper::vloadq(in_ptr_xi0_yi1 + cout *
sizeof(int8_t));
215 const auto in11 =
wrapper::vloadq(in_ptr_xi1_yi1 + cout *
sizeof(int8_t));
// Re-quantization of the four float result vectors out_0..out_3 (computed on
// elided lines). NOTE(review): wrapper::vmla(voffset_o, out_k, invvscale_o) is
// presumably voffset_o + out_k * invvscale_o — confirm. The float-to-int32
// conversion differs per build: vcvta on AArch64 non-bare-metal builds, vcvt
// otherwise; presumably these differ in rounding mode — confirm.
305 #if defined(__aarch64__) && !defined(BARE_METAL)
306 const auto out_0_int = wrapper::vcvta<int32_t>(
wrapper::vmla(voffset_o, out_0, invvscale_o));
307 const auto out_1_int = wrapper::vcvta<int32_t>(
wrapper::vmla(voffset_o, out_1, invvscale_o));
308 const auto out_2_int = wrapper::vcvta<int32_t>(
wrapper::vmla(voffset_o, out_2, invvscale_o));
309 const auto out_3_int = wrapper::vcvta<int32_t>(
wrapper::vmla(voffset_o, out_3, invvscale_o));
310 #else // defined(__aarch64__) && !defined(BARE_METAL)
311 const auto out_0_int = wrapper::vcvt<int32_t>(
wrapper::vmla(voffset_o, out_0, invvscale_o));
312 const auto out_1_int = wrapper::vcvt<int32_t>(
wrapper::vmla(voffset_o, out_1, invvscale_o));
313 const auto out_2_int = wrapper::vcvt<int32_t>(
wrapper::vmla(voffset_o, out_2, invvscale_o));
314 const auto out_3_int = wrapper::vcvt<int32_t>(
wrapper::vmla(voffset_o, out_3, invvscale_o));
315 #endif // defined(__aarch64__) && !defined(BARE_METAL)
// Initialisers of low_part / high_part are elided from this excerpt.
// NOTE(review): presumably the narrowed halves of the int32 results — confirm.
316 const auto low_part =
318 const auto high_part =
// Scalar tail loop: handles the channels left over after the vector loop.
325 for (; cout < out_dim_ch; ++cout)
// The four neighbour samples for this channel.
327 const int8_t in00 = *(in_ptr_xi0_yi0 + cout *
sizeof(int8_t));
328 const int8_t in01 = *(in_ptr_xi1_yi0 + cout *
sizeof(int8_t));
329 const int8_t in10 = *(in_ptr_xi0_yi1 + cout *
sizeof(int8_t));
330 const int8_t in11 = *(in_ptr_xi1_yi1 + cout *
sizeof(int8_t));
// De-quantize each sample: (value - input offset) * input scale.
332 const float in00_f = (
static_cast<int32_t
>(in00) - iq_info.offset) * iq_info.scale;
333 const float in01_f = (
static_cast<int32_t
>(in01) - iq_info.offset) * iq_info.scale;
334 const float in10_f = (
static_cast<int32_t
>(in10) - iq_info.offset) * iq_info.scale;
335 const float in11_f = (
static_cast<int32_t
>(in11) - iq_info.offset) * iq_info.scale;
// Weighted sum of the four de-quantized neighbours.
337 float out = in00_f * s00_s;
338 out += in01_f * s01_s;
339 out += in10_f * s10_s;
340 out += in11_f * s11_s;
// Build-dependent store of the result into the output element. The right-hand
// sides, and the whole statement of the first branch, are elided here.
343 #if defined(__aarch64__) && !defined(BARE_METAL)
345 #else // defined(__aarch64__) && !defined(BARE_METAL)
346 *(out_ptr_xo_yo + cout *
sizeof(int8_t)) =
348 #endif // defined(__aarch64__) && !defined(BARE_METAL)
// Fragment of a dispatching function that forwards to one of three scale
// implementations.
// NOTE(review): this excerpt is line-sampled. The function header, the remaining
// parameters, part of the first condition and the conditions guarding the second
// and third call are not visible, so the exact selection rules cannot be stated.
370 float sampling_offset,
// First visible condition: source and destination quantization info compare
// equal, combined via && with a further operand that is elided.
376 if (
src->info()->quantization_info() ==
dst->info()->quantization_info() &&
// Forwards all arguments, including policy and border_mode, to s8_neon_scale.
// NOTE(review): presumably this path skips re-quantization because the
// quantization info is identical — confirm.
379 s8_neon_scale(
src,
dst, offsets, dx, dy, policy, border_mode, constant_border_value, sampling_offset,
380 align_corners, window);
// Bilinear implementation for int8_t data with uniform quantization info;
// `policy` is not forwarded here.
384 qasymm8_signed_neon_scale_bilinear(
src,
dst, offsets, dx, dy, border_mode, constant_border_value,
385 sampling_offset, align_corners, window);
// Nearest-neighbour implementation instantiated for int8_t; takes neither dx/dy
// nor the border arguments.
390 nearest_neon_scale<int8_t>(
src,
dst, offsets, sampling_offset, align_corners, window);