// Bodies of small NEON fixed-point helpers.
// NOTE(review): the enclosing signatures are not adjacent to these statements; the
// parameter names a / b / x / exponent are taken from their uses below.

// 4-lane form: saturating, rounding, doubling multiply of every 32-bit lane of `a`
// by the scalar `b`, keeping the high half of each product.
37 return vqrdmulhq_n_s32(a,
b);
// Scalar form: broadcast `a` into a 2-lane vector, apply the same multiply-high by
// `b`, and return lane 0.
42 return vget_lane_s32(vqrdmulh_n_s32(vdup_n_s32(a),
b), 0);
// 4-lane shift by -exponent. vrshlq_s32 with a negative per-lane count performs a
// rounding shift to the right.
47 const int32x4_t shift = vdupq_n_s32(-exponent);
// (x & shift) >> 31 is -1 in lanes where both x and shift are negative, 0 elsewhere.
48 const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift), 31);
// Saturating add of the fixup: negative inputs are decremented by one before the
// rounding shift is applied.
49 const int32x4_t fixed = vqaddq_s32(x, fixup);
50 return vrshlq_s32(fixed, shift);
// 2-lane variant of the same sequence.
55 const int32x2_t shift = vdup_n_s32(-exponent);
56 const int32x2_t fixup = vshr_n_s32(vand_s32(x, shift), 31);
57 const int32x2_t fixed = vqadd_s32(x, fixup);
58 return vrshl_s32(fixed, shift);
// Broadcast a scalar x into a 2-lane vector (presumably the start of a scalar
// overload that reuses the 2-lane variant above - TODO confirm).
63 const int32x2_t xs = vdup_n_s32(x);
// Quantized depthwise-convolution loop used when depth_multiplier == 1 (this is the
// variant the dispatcher further down selects for that case).
//
// T is the element type read from the input and written to the output; TW is the
// element type read from the weights (see the reinterpret_casts below). All
// accumulation is done in int32_t.
//
// output_multiplier / output_shift are indexed per channel with .at(), so an index
// outside the vector throws std::out_of_range.
// NOTE(review): both vectors are taken by value, so every call copies them; a const
// reference would avoid the copy if nothing relies on it - confirm with callers.
69 template <
typename T,
typename TW>
70 void depthwise_loop_multiplier1_quantized(
const ITensor *
src,
76 std::vector<int> output_multiplier,
77 std::vector<int> output_shift,
// Number of T elements handled by one vector iteration.
82 constexpr
auto element_per_vector =
vector_size /
sizeof(T);
85 using AccType = int32_t;
86 using AccArrayType = std::array<AccType, element_per_vector>;
// Fallback input value: a PixelValue built from 0 with the source data type and
// quantization info, read back as T. It is the "else" branch of the input loads below.
88 const auto out_of_bound_value =
89 PixelValue(
static_cast<uint64_t
>(0),
src->info()->data_type(),
src->info()->quantization_info()).
get<T>();
// Same fallback value broadcast to a full vector for the vectorised loop.
90 const auto out_of_bound_vector =
wrapper::vdup_n(
static_cast<T
>(out_of_bound_value), TagType{});
// Kernel geometry, strides, padding and channel ranges derived from the tensor infos.
92 const auto run_info = DepthwiseConvolutionRunInfo(*
src->info(), *weights->
info(),
conv_info, window);
// Quantization offsets (uniform quantization) of the input and output tensors.
94 const int32_t input_qoffset =
src->info()->quantization_info().uniform().offset;
96 const int32_t output_qoffset =
dst->info()->quantization_info().uniform().offset;
// Constant term of the offset-corrected product:
// (number of kernel taps) * input_qoffset * weights_qoffset.
97 const int32_t k_offset = run_info.weights_width * run_info.weights_height * input_qoffset * weights_qoffset;
// Execution window and per-tensor windows, all derived from the caller's window
// (win_weights is copied from win_input).
99 Window execution_window = window;
102 Window win_input = window;
107 Window win_weights = win_input;
110 Window win_output = window;
113 Iterator input_it(
src, win_input);
114 Iterator weights_it(weights, win_weights);
115 Iterator output_it(
dst, win_output);
// Default-constructed; replaced by an iterator over the biases tensor below
// (presumably only when biases are present - TODO confirm).
116 Iterator biases_it{};
120 biases_it = Iterator(biases, win_weights);
// Body executed for each position of the window loop; `id` holds its coordinates.
125 [&](
const Coordinates &
id)
// First input coordinate covered by the kernel for this output position:
// id.y() is scaled by the x-stride and shifted by the left padding,
// id.z() by the y-stride and the top padding. Both can be negative.
127 const int32_t input_y =
id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
128 const int32_t input_z =
id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
// Offset of that position; it is added to input_it.ptr() when loading.
129 const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
130 auto const base_weights_ptr = weights_it.ptr();
131 size_t x = run_info.x_start;
// Vectorised part: channels [x_start, x_leftover_start), x_step at a time.
133 for (; x < run_info.x_leftover_start; x += run_info.x_step)
// Per-lane running sums of the raw input and weight values, needed for the
// offset correction after the kernel window has been traversed.
136 AccArrayType in_sum{};
137 AccArrayType we_sum{};
139 auto weights_ptr = base_weights_ptr;
140 auto input_offset = base_input_offset;
// Walk the kernel rows (h) and, inside, the kernel columns (w).
142 for (
size_t h = 0; h < run_info.weights_height; ++h)
// x * sizeof(T) moves to the current channel within the row.
144 int64_t offs = input_offset + x *
sizeof(T);
145 for (
size_t w = 0;
w < run_info.weights_width; ++
w)
// Input vector for this tap. The offset is cast to size_t and clamped to
// input_max_offset before being added to the pointer; the alternative branch
// of the selection yields out_of_bound_vector.
// NOTE(review): a negative offs wraps to a large size_t and is then clamped by
// std::min, so the address stays in range even for padded taps - confirm the
// selection condition discards that value.
148 const auto input_vals =
151 input_it.ptr() + std::min(
static_cast<size_t>(offs), run_info.input_max_offset)))
152 : out_of_bound_vector;
// Weights vector for this tap: row pointer advanced by w * weights_stride_y,
// then by x elements of TW.
153 const auto weights_vals =
154 wrapper::vload(
reinterpret_cast<TW *
>(weights_ptr +
w * run_info.weights_stride_y) + x);
// Lane-wise multiply-accumulate plus the running sums.
156 for (
size_t i = 0; i < element_per_vector; ++i)
158 acc.at(i) += input_vals[i] * weights_vals[i];
159 in_sum.at(i) += input_vals[i];
160 we_sum.at(i) += weights_vals[i];
// Next kernel column, dilation.x() input positions further.
163 offs += dilation.
x() * run_info.input_stride_y;
// Next kernel row in both weights and input.
166 weights_ptr += run_info.weights_stride_z;
167 input_offset += dilation.
y() * run_info.input_stride_z;
// Offset correction and output stage, lane by lane. With K kernel taps:
//   sum((in - in_off) * (w - w_off))
//     = sum(in * w) - w_off * sum(in) - in_off * sum(w) + K * in_off * w_off
171 for (
size_t i = 0; i < element_per_vector; ++i)
173 acc.at(i) -= in_sum.at(i) * weights_qoffset;
174 acc.at(i) -= we_sum.at(i) * input_qoffset;
175 acc.at(i) += k_offset;
// Bias of channel x + i: i int32 elements are skipped via the byte offset, x via
// the typed pointer (presumably guarded by has_biases - TODO confirm).
179 acc.at(i) += *(
reinterpret_cast<int32_t *
>(biases_it.ptr() + i *
sizeof(int32_t)) + x);
// Requantization parameters of channel x + i.
182 const int32_t out_mul = output_multiplier.at(x + i);
183 const int32_t out_shift = output_shift.at(x + i);
// Convert the accumulator to T through utility::clamp (presumably saturating to
// the range of T - TODO confirm) and store it in lane i.
195 out_vals[i] =
static_cast<T
>(utility::clamp<AccType, T>(acc.at(i)));
// Scalar tail: x continues from the vector loop and covers the remaining channels
// up to x_end one at a time, following the same steps as above.
202 for (; x < run_info.x_end; ++x)
208 auto weights_ptr = base_weights_ptr;
209 auto input_offset = base_input_offset;
211 for (
size_t h = 0; h < run_info.weights_height; ++h)
213 int64_t offs = input_offset + x *
sizeof(T);
214 for (
size_t w = 0;
w < run_info.weights_width; ++
w)
// Single input element, or out_of_bound_value in the alternative branch.
217 const auto input_val =
219 ? *
reinterpret_cast<T *
>(input_it.ptr() +
220 std::min(
static_cast<size_t>(offs), run_info.input_max_offset))
221 : out_of_bound_value;
// Single weight element for channel x at kernel column w.
222 const auto weights_val =
223 *(
reinterpret_cast<TW *
>(weights_ptr +
w * run_info.weights_stride_y) + x);
225 acc += input_val * weights_val;
227 we_sum += weights_val;
229 offs += dilation.
x() * run_info.input_stride_y;
232 weights_ptr += run_info.weights_stride_z;
233 input_offset += dilation.
y() * run_info.input_stride_z;
// Same offset-correction terms as in the vector path.
238 acc -= in_sum * weights_qoffset;
239 acc -= we_sum * input_qoffset;
// Bias of channel x.
244 acc += *(
reinterpret_cast<int32_t *
>(biases_it.ptr()) + x);
247 const int32_t out_mul = output_multiplier.at(x);
248 const int32_t out_shift = output_shift.at(x);
// Convert to T and write the result for channel x.
260 out_vals =
static_cast<T
>(utility::clamp<AccType, T>(acc));
261 *(
reinterpret_cast<T *
>(output_it.ptr()) + x) = out_vals;
// Iterators handed over together with the lambda (presumably to the window-loop
// helper, which advances them - TODO confirm).
264 input_it, weights_it, biases_it, output_it);
// Generic quantized depthwise-convolution loop for an arbitrary depth_multiplier
// (the dispatcher further down uses it when neither specialised variant applies).
//
// For every input element, depth_multiplier consecutive weights are read and
// depth_multiplier consecutive outputs are produced. T is the input/output element
// type, TW the weights element type; accumulation is done in int32_t.
//
// NOTE(review): output_multiplier / output_shift are taken by value and copied on
// every call; a const reference would avoid that if nothing relies on the copy.
267 template <
typename T,
typename TW>
268 void depthwise_loop_generic_quantized(
const ITensor *
src,
269 const ITensor *weights,
270 const ITensor *biases,
273 const Size2D &dilation,
274 unsigned int depth_multiplier,
275 std::vector<int> output_multiplier,
276 std::vector<int> output_shift,
277 const Window &window,
280 using AccType = int32_t;
// Kernel geometry, strides, padding and channel ranges derived from the tensor infos.
282 const auto run_info =
283 DepthwiseConvolutionRunInfo(*
src->info(), *weights->info(),
conv_info, window, depth_multiplier);
// Fallback input value used when is_valid_region is false: a PixelValue built from 0
// with the source data type and quantization info, read back as T.
285 const auto out_of_bound_value =
286 PixelValue(
static_cast<uint64_t
>(0),
src->info()->data_type(),
src->info()->quantization_info()).get<T>();
// Quantization offsets (uniform quantization) of input, weights and output.
288 const int32_t input_qoffset =
src->info()->quantization_info().uniform().offset;
289 const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset;
290 const int32_t output_qoffset =
dst->info()->quantization_info().uniform().offset;
// Constant term of the offset-corrected product:
// (number of kernel taps) * input_qoffset * weights_qoffset.
291 const int32_t k_offset = run_info.weights_width * run_info.weights_height * input_qoffset * weights_qoffset;
// The execution window covers every input channel one by one along X.
293 Window execution_window = window;
294 execution_window.set(
Window::DimX, Window::Dimension(0, run_info.input_depth, 1));
296 Window win_input = execution_window;
// Weights and output windows step by x_step along X.
300 Window win_weights = window;
301 win_weights.set_dimension_step(
Window::DimX, run_info.x_step);
306 Window win_output = window;
307 win_output.set_dimension_step(
Window::DimX, run_info.x_step);
309 Iterator input_it(
src, win_input);
310 Iterator weights_it(weights, win_weights);
311 Iterator output_it(
dst, win_output);
// Default-constructed; replaced by an iterator over the biases tensor below
// (presumably only when biases are present - TODO confirm).
312 Iterator biases_it{};
316 biases_it = Iterator(biases, win_weights);
// Body executed for each position of the window loop; `id` holds its coordinates.
321 [&](
const Coordinates &
id)
// One accumulator and one weight sum per depth-multiplier slot.
// NOTE(review): both vectors are constructed inside the per-position lambda, i.e.
// allocated for every position; hoisting and re-filling them would avoid that.
323 std::vector<AccType> acc(depth_multiplier, 0);
324 std::vector<AccType> we_sum(depth_multiplier, 0);
// First input coordinate covered by the kernel (can be negative because of padding).
327 const int32_t input_y =
id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
328 const int32_t input_z =
id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
329 int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
331 auto weights_ptr = weights_it.ptr();
// Walk the kernel rows (h) and, inside, the kernel columns (w).
332 for (
size_t h = 0; h < run_info.weights_height; ++h)
// NOTE(review): input_offset is int64_t but is narrowed to int here, whereas
// depthwise_loop_multiplier1_quantized keeps this offset as int64_t - confirm the
// offset always fits in int or widen for consistency.
334 int offs = input_offset;
335 for (
size_t w = 0;
w < run_info.weights_width; ++
w)
// One input element per tap, shared by all depth-multiplier slots. The offset is
// cast to size_t and clamped to input_max_offset; outside the valid region the
// fallback value is used instead of the loaded one.
338 const auto input_val =
339 is_valid_region ? *(
reinterpret_cast<T *
>(input_it.ptr() + std::min(
static_cast<size_t>(offs),
340 run_info.input_max_offset)))
341 : out_of_bound_value;
343 for (
size_t m = 0; m < depth_multiplier; ++m)
// Weight for slot m at kernel column w.
// NOTE(review): the stride between slots is m * sizeof(T) although the element is
// read as TW; this assumes sizeof(T) == sizeof(TW) - confirm.
345 const auto weights_val =
346 *(
reinterpret_cast<TW *
>(weights_ptr + m *
sizeof(T) +
w * run_info.weights_stride_y));
347 acc.at(m) += input_val * weights_val;
349 we_sum.at(m) += weights_val;
// Next kernel column, dilation.x() input positions further.
352 offs += dilation.x() * run_info.input_stride_y;
// Next kernel row in both weights and input.
356 weights_ptr += run_info.weights_stride_z;
357 input_offset += dilation.y() * run_info.input_stride_z;
// Offset correction and output stage per slot. With K kernel taps:
//   sum((in - in_off) * (w - w_off))
//     = sum(in * w) - w_off * sum(in) - in_off * sum(w) + K * in_off * w_off
// in_sum is a single value because every slot sees the same input elements.
360 for (
size_t m = 0; m < depth_multiplier; ++m)
362 acc.at(m) -= in_sum * weights_qoffset;
363 acc.at(m) -= we_sum.at(m) * input_qoffset;
364 acc.at(m) += k_offset;
// Bias for slot m, m int32 elements past the bias iterator position
// (presumably guarded by has_biases - TODO confirm).
368 acc.at(m) += *(
reinterpret_cast<int32_t *
>(biases_it.ptr() + m *
sizeof(int32_t)));
// Requantization parameters are looked up at index id.x() * depth_multiplier + m;
// .at() throws std::out_of_range if the vectors are shorter than that.
371 const int32_t out_mul = output_multiplier.at(
id.x() * depth_multiplier + m);
372 const int32_t out_shift = output_shift.at(
id.x() * depth_multiplier + m);
// Convert to T through utility::clamp (presumably saturating to the range of T -
// TODO confirm) and store at slot m of the current output position.
382 *(
reinterpret_cast<T *
>(output_it.ptr() + m *
sizeof(T))) =
383 static_cast<T
>(utility::clamp<AccType, T>(acc.at(m)));
// Iterators handed over together with the lambda (presumably to the window-loop
// helper, which advances them - TODO confirm).
386 input_it, weights_it, biases_it, output_it);
// Vectorised quantized depthwise-convolution loop. The dispatcher further down
// selects it when depth_multiplier is a power of two, is at least 8, and the
// quantization is per-tensor.
//
// Only element 0 of output_multiplier / output_shift is read, i.e. one
// multiplier/shift pair is used for every channel.
// NOTE(review): both vectors are nevertheless taken by value and copied per call;
// a const reference would avoid that if nothing relies on the copy.
389 template <
typename T,
typename TW>
390 void depthwise_loop_pow2_quantized_per_tensor(
const ITensor *
src,
391 const ITensor *weights,
392 const ITensor *biases,
395 const Size2D &dilation,
396 unsigned int depth_multiplier,
397 std::vector<int> output_multiplier,
398 std::vector<int> output_shift,
399 const Window &window,
404 using AccType = int32_t;
// Tag types selecting the wrapper overloads: half_vec lanes of AccType for the
// accumulators, vector_size lanes of T for the 8-bit data.
406 using AccVectorTagType =
typename wrapper::traits::neon_vector<AccType, half_vec>::tag_type;
407 using TagType =
typename wrapper::traits::neon_vector<T, vector_size>::tag_type;
// Kernel geometry, strides, padding and channel ranges derived from the tensor infos.
409 const auto run_info =
410 DepthwiseConvolutionRunInfo(*
src->info(), *weights->info(),
conv_info, window, depth_multiplier);
// Weights quantization offset broadcast to a vector (tail of the initialiser of the
// weights offset vector used by the vsub below).
415 wrapper::vdup_n(
static_cast<TW
>(weights->info()->quantization_info().uniform().offset), TagType{})));
// Output quantization offset broadcast to a vector.
416 const auto output_qoffset_vec =
wrapper::vdup_n(
dst->info()->quantization_info().uniform().offset,
// Vector constants holding the maximum of T and zero, in accumulator precision
// (presumably the clamping bounds of the output stage - TODO confirm).
420 const auto upper =
wrapper::vdup_n(
static_cast<AccType
>(std::numeric_limits<T>::max()), AccVectorTagType{});
421 const auto zero =
wrapper::vdup_n(
static_cast<AccType
>(0), AccVectorTagType{});
// Per-tensor requantization parameters; .at(0) throws std::out_of_range if the
// vectors are empty.
423 const auto out_mul = output_multiplier.at(0);
424 const auto out_shift = output_shift.at(0);
// The execution window covers every input channel one by one along X.
426 Window execution_window = window;
427 execution_window.set(
Window::DimX, Window::Dimension(0, run_info.input_depth, 1));
429 Window win_input = execution_window;
// Weights and output windows step by x_step along X.
433 Window win_weights = window;
434 win_weights.set_dimension_step(
Window::DimX, run_info.x_step);
439 Window win_output = window;
440 win_output.set_dimension_step(
Window::DimX, run_info.x_step);
442 Iterator input_it(
src, win_input);
443 Iterator weights_it(weights, win_weights);
444 Iterator output_it(
dst, win_output);
// Default-constructed; replaced by an iterator over the biases tensor below
// (presumably only when biases are present - TODO confirm).
445 Iterator biases_it{};
449 biases_it = Iterator(biases, win_weights);
// Two accumulator vectors per group of vector_size outputs, allocated once before
// the window loop and shared by all positions.
452 std::vector<AccVectorType> acc0(depth_multiplier /
vector_size);
453 std::vector<AccVectorType> acc1(depth_multiplier /
vector_size);
// Body executed for each position of the window loop; `id` holds its coordinates.
457 [&](
const Coordinates &
id)
// First input coordinate covered by the kernel (can be negative because of padding).
462 const int32_t input_y =
id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
463 const int32_t input_z =
id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
464 int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
466 auto weights_ptr = weights_it.ptr();
467 for (
size_t h = 0; h < run_info.weights_height; ++h)
// Input row addressed by kernel row h; rows outside [0, input_height) are skipped
// rather than replaced by a fallback value.
// NOTE(review): h is size_t, so the sum is evaluated in unsigned arithmetic and
// converted back to int32_t; a negative input_z relies on modular wrap-around.
469 const int32_t current_h = input_z + h * dilation.y();
470 if (current_h >= 0 && current_h <
static_cast<int32_t
>(run_info.input_height))
// NOTE(review): input_offset is int64_t but is narrowed to int here, whereas
// depthwise_loop_multiplier1_quantized keeps this offset as int64_t - confirm the
// offset always fits in int or widen for consistency.
472 int offs = input_offset;
473 for (
size_t w = 0;
w < run_info.weights_width; ++
w)
// Same bounds test for the input column addressed by kernel column w.
475 const int32_t current_w = input_y +
w * dilation.x();
476 if (current_w >= 0 && current_w <
static_cast<int32_t
>(run_info.input_width))
// Read one input element; the offset is cast to size_t and clamped to
// input_max_offset before being added to the pointer.
479 *(
reinterpret_cast<T *
>(
480 input_it.ptr() + std::min(
static_cast<size_t>(offs), run_info.input_max_offset))),
// Remove the input quantization offset from the widened input vector.
483 const auto input_no_offs =
wrapper::vsub(input_s16x8, input_qoffset_vec);
// Process the depth-multiplier slots vector_size at a time; i indexes the
// accumulator pair for the current group.
485 for (
size_t m = 0, i = 0; m < depth_multiplier; m +=
vector_size, ++i)
// Address of the weights for slots [m, m + vector_size) at kernel column w.
488 weights_ptr + m *
sizeof(T) +
w * run_info.weights_stride_y));
// Remove the weights quantization offset from the widened weights vector.
490 const auto weights_no_offs =
wrapper::vsub(weights_s16x8, weights_qoffset_vec);
// Next kernel column, dilation.x() input positions further.
499 offs += dilation.x() * run_info.input_stride_y;
// Next kernel row in both weights and input.
503 weights_ptr += run_info.weights_stride_z;
504 input_offset += dilation.y() * run_info.input_stride_z;
// Output stage, again one group of vector_size slots per iteration.
507 for (
size_t m = 0, i = 0; m < depth_multiplier; m +=
vector_size, ++i)
// Biases for the first half of the group (slots m ...) and, below, for the second
// half starting at slot m + half_vec.
511 const auto bias_val0 =
512 wrapper::vloadq(
reinterpret_cast<int32_t *
>(biases_it.ptr() + m *
sizeof(int32_t)));
514 reinterpret_cast<int32_t *
>(biases_it.ptr() + (m + half_vec) *
sizeof(int32_t)));
// Store through a uint8_t pointer when T is uint8_t, otherwise through int8_t.
542 if (std::is_same<T, uint8_t>::value)
544 wrapper::vstore(
reinterpret_cast<uint8_t *
>(output_it.ptr() + m *
sizeof(uint8_t)),
549 wrapper::vstore(
reinterpret_cast<int8_t *
>(output_it.ptr() + m *
sizeof(int8_t)),
// Iterators handed over together with the lambda (presumably to the window-loop
// helper, which advances them - TODO confirm).
554 input_it, weights_it, biases_it, output_it);
// Entry point for the quantized path: builds the output multiplier/shift tables from
// the tensor scales and dispatches to one of the three loop implementations above.
// T is forwarded as the input/output element type and TW as the weights element type.
558 template <
typename T,
typename TW>
568 unsigned int depth_multiplier =
info.depth_multiplier;
// Requantization tables, one entry per element of weights_scale.
// NOTE(review): the final size is known (weights_scale.size()), so a reserve()
// before the loop would avoid reallocations.
570 std::vector<int> output_multiplier;
571 std::vector<int> output_shift;
573 const auto input_scale =
src->info()->quantization_info().uniform().scale;
574 const auto output_scale =
dst->info()->quantization_info().uniform().scale;
// Appends a copy of the first weights scale (presumably repeated so that a
// per-tensor scale gets one entry per channel - TODO confirm).
581 weights_scale.push_back(weights_scale.front());
585 for (
const auto &s : weights_scale)
587 int32_t out_mult = 0;
588 int32_t out_shift = 0;
// Effective real-valued rescale factor for this weights scale.
589 const float multiplier = input_scale * s / output_scale;
// Record the integer multiplier/shift pair for this scale (presumably computed
// from `multiplier` just before this point - TODO confirm).
592 output_multiplier.push_back(out_mult);
593 output_shift.push_back(out_shift);
// depth_multiplier == 1 has a dedicated implementation.
596 if (depth_multiplier == 1)
598 depthwise_loop_multiplier1_quantized<T, TW>(
src, weights, biases,
dst,
conv_info, dilation, output_multiplier,
599 output_shift, window, has_biases);
// x & (x - 1) clears the lowest set bit, so the result is 0 exactly for powers of
// two (and for 0, which the >= 8 test below excludes).
603 const bool is_pow2 = ((depth_multiplier & (depth_multiplier - 1)) == 0);
// Vectorised variant: power-of-two multiplier of at least 8 with per-tensor
// quantization.
606 if (is_pow2 && is_quantized_per_tensor && depth_multiplier >= 8)
608 depthwise_loop_pow2_quantized_per_tensor<T, TW>(
src, weights, biases,
dst,
conv_info, dilation,
609 depth_multiplier, output_multiplier, output_shift, window,
// Generic variant for the remaining cases.
614 depthwise_loop_generic_quantized<T, TW>(
src, weights, biases,
dst,
conv_info, dilation, depth_multiplier,
615 output_multiplier, output_shift, window, has_biases);