constexpr auto   dim_manual_loop      = Window::Dimension(0, 0, 0);
constexpr auto   dim_single_unit_step = Window::Dimension(0, 1, 1);
constexpr size_t vector_size          = 8;
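
// Pre-computed sizes, strides and loop bounds shared by all the depthwise loops below. Everything
// is derived once from the tensor infos, the convolution descriptor and the execution window so
// that the inner loops only perform pointer arithmetic.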
struct DepthwiseConvolutionRunInfo
{
    const size_t   num_read_elements_per_iteration;
    const uint32_t x_start;
    const uint32_t x_end;
    const uint32_t x_step;
    const uint32_t x_leftover_start;
    const size_t   input_stride_y;
    const size_t   input_stride_z;
    const size_t   input_max_offset;
    const size_t   weights_width;
    const size_t   weights_height;
    const size_t   weights_stride_y;
    const size_t   weights_stride_z;
    const size_t   conv_stride_x;
    const size_t   conv_stride_y;
    const size_t   conv_pad_left;
    const size_t   conv_pad_top;
    const size_t   input_height;
    const size_t   input_width;
    const size_t   input_depth;

    DepthwiseConvolutionRunInfo(const ITensorInfo &input, const ITensorInfo &weights, const PadStrideInfo &conv_info, const Window &w, uint32_t depth_multiplier = 1)
        : num_read_elements_per_iteration((depth_multiplier == 1) ? (vector_size / element_size_from_data_type(input.data_type())) : 1),
          x_start(w.x().start()),
          x_end(w.x().end()),
          x_step(static_cast<uint32_t>(num_read_elements_per_iteration * depth_multiplier)),
          x_leftover_start(std::max(static_cast<int32_t>(w.x().end() + 1) - static_cast<int32_t>(x_step), int32_t(0))),
          input_stride_y(input.strides_in_bytes().y()),
          input_stride_z(input.strides_in_bytes().z()),
          input_max_offset(input.strides_in_bytes().z() * input.dimension(height_idx) - (input.padding().bottom + input.padding().top) * input.strides_in_bytes().y()),
          weights_width(weights.dimension(width_idx)),
          weights_height(weights.dimension(height_idx)),
          weights_stride_y(weights.strides_in_bytes().y()),
          weights_stride_z(weights.strides_in_bytes().z()),
          conv_stride_x(conv_info.stride().first),
          conv_stride_y(conv_info.stride().second),
          conv_pad_left(conv_info.pad_left()),
          conv_pad_top(conv_info.pad_top()),
          input_height(input.dimension(height_idx)),
          input_width(input.dimension(width_idx)),
          input_depth(input.dimension(channel_idx))
    {
    }
};
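
// Fixed-point helpers used for requantization (gemmlowp-style).
// saturating_doubling_high_mul(a, b) returns the high 32 bits of the doubled product 2*a*b with
// rounding and saturation, i.e. round(a * b / 2^31), which is exactly what SQRDMULH computes.
// For example, with b = 1 << 30 (0.5 in Q31) the call halves `a` with rounding.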
inline int32x4_t saturating_doubling_high_mul(const int32x4_t &a, const int32_t &b)
{
    return vqrdmulhq_n_s32(a, b);
}

inline int32_t saturating_doubling_high_mul(const int32_t &a, const int32_t &b)
{
    return vget_lane_s32(vqrdmulh_n_s32(vdup_n_s32(a), b), 0);
}
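
// rounding_divide_by_exp2(x, n) computes x / 2^n rounded to nearest: the fixup term removes the
// bias a plain arithmetic right shift would introduce for negative values, and vrshl with a
// negative shift amount performs the final rounding shift to the right.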
inline int32x4_t rounding_divide_by_exp2(const int32x4_t &x, const int exponent)
{
    const int32x4_t shift = vdupq_n_s32(-exponent);
    const int32x4_t fixup = vshrq_n_s32(vandq_s32(x, shift), 31);
    const int32x4_t fixed = vqaddq_s32(x, fixup);
    return vrshlq_s32(fixed, shift);
}

inline int32x2_t rounding_divide_by_exp2(const int32x2_t &x, const int exponent)
{
    const int32x2_t shift = vdup_n_s32(-exponent);
    const int32x2_t fixup = vshr_n_s32(vand_s32(x, shift), 31);
    const int32x2_t fixed = vqadd_s32(x, fixup);
    return vrshl_s32(fixed, shift);
}

inline int32_t rounding_divide_by_exp2(const int32_t &x, const int exponent)
{
    const int32x2_t xs = vdup_n_s32(x);
    return vget_lane_s32(rounding_divide_by_exp2(xs, exponent), 0);
}
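
// Padding is handled implicitly: a filter tap that falls outside the input plane is detected with
// this check and contributes zero (float paths) or the input quantization offset (quantized paths)
// instead of reading from padded borders.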
inline bool is_valid_input_region(int32_t base_w, uint32_t base_h, uint32_t w, uint32_t h, const DepthwiseConvolutionRunInfo &run_info, const Size2D &dilation)
{
    const int32_t current_h  = base_h + h * dilation.y();
    const bool    is_valid_h = current_h >= 0 && current_h < static_cast<int32_t>(run_info.input_height);

    const int32_t current_w  = base_w + w * dilation.x();
    const bool    is_valid_w = current_w >= 0 && current_w < static_cast<int32_t>(run_info.input_width);

    return is_valid_h && is_valid_w;
}
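
// Float path for depth_multiplier == 1. The execution window advances one output pixel at a time
// while the channel axis (DimX) is walked manually: a vectorized loop covers whole vectors of
// channels up to x_leftover_start, then a scalar loop handles the remaining channels.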
template <typename T>
void depthwise_loop_multiplier1_fp(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
                                   const Size2D &dilation, const Window &window, bool has_biases)
{
    constexpr auto element_per_vector = vector_size / sizeof(T);
    using VectorType                  = typename wrapper::traits::neon_vector<T, element_per_vector>::type;
    using TagType                     = typename wrapper::traits::neon_vector<T, element_per_vector>::tag_type;

    const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window);

    const VectorType zero_vector = wrapper::vdup_n(static_cast<T>(0), TagType{});

    Window execution_window = window;
    execution_window.set(Window::DimX, dim_single_unit_step);

    // Input, weights and output offsets are computed manually inside the loop body, so those
    // iterators must not be advanced automatically by the window.
    Window win_input = window;
    win_input.set(Window::DimX, dim_manual_loop);
    win_input.set(Window::DimY, dim_manual_loop);
    win_input.set(Window::DimZ, dim_manual_loop);

    Window win_weights = win_input;
    win_weights.set(Window::DimW, dim_manual_loop);

    Window win_output = window;
    win_output.set(Window::DimX, dim_manual_loop);

    Iterator input_it(src, win_input);
    Iterator weights_it(weights, win_weights);
    Iterator output_it(dst, win_output);
    Iterator biases_it{};

    if(has_biases)
    {
        biases_it = Iterator(biases, win_weights);
    }

    execute_window_loop(execution_window, [&](const Coordinates & id)
    {
        const int32_t input_y           = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
        const int32_t input_z           = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
        const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;

        auto const base_weights_ptr = weights_it.ptr();
        uint32_t   x                = run_info.x_start;

        // Vectorized part: element_per_vector channels per iteration.
        for(; x < run_info.x_leftover_start; x += run_info.x_step)
        {
            VectorType acc          = zero_vector;
            auto       weights_ptr  = base_weights_ptr;
            int64_t    input_offset = base_input_offset;

            for(uint32_t h = 0; h < run_info.weights_height; ++h)
            {
                int64_t offs = input_offset + x * sizeof(T);
                for(uint32_t w = 0; w < run_info.weights_width; ++w)
                {
                    const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
                    const auto input_vals      = is_valid_region ?
                                                 wrapper::vload(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) :
                                                 zero_vector;
                    const auto weights_vals = wrapper::vload(reinterpret_cast<T *>(weights_ptr + w * run_info.weights_stride_y) + x);
                    acc                     = wrapper::vmla(acc, weights_vals, input_vals);

                    offs += dilation.x() * run_info.input_stride_y;
                }

                weights_ptr += run_info.weights_stride_z;
                input_offset += dilation.y() * run_info.input_stride_z;
            }

            if(has_biases)
            {
                const auto biases_vals = wrapper::vload(reinterpret_cast<T *>(biases_it.ptr()) + x);
                wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()) + x, wrapper::vadd(acc, biases_vals));
            }
            else
            {
                wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()) + x, acc);
            }
        }

        // Scalar tail: channels that do not fill a whole vector.
        for(; x < run_info.x_end; ++x)
        {
            auto    acc_scalar   = T{ 0 };
            auto    weights_ptr  = base_weights_ptr;
            int64_t input_offset = base_input_offset;

            for(size_t h = 0; h < run_info.weights_height; ++h)
            {
                int64_t offs = input_offset + x * sizeof(T);
                for(size_t w = 0; w < run_info.weights_width; ++w)
                {
                    const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
                    const auto input_vals      = is_valid_region ? *reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset)) : 0;
                    const auto weights_vals    = *(reinterpret_cast<T *>(weights_ptr + w * run_info.weights_stride_y) + x);

                    acc_scalar += (input_vals * weights_vals);

                    offs += dilation.x() * run_info.input_stride_y;
                }

                weights_ptr += run_info.weights_stride_z;
                input_offset += dilation.y() * run_info.input_stride_z;
            }

            if(has_biases)
            {
                const auto biases_vals = *(reinterpret_cast<T *>(biases_it.ptr()) + x);
                acc_scalar += biases_vals;
            }

            *(reinterpret_cast<T *>(output_it.ptr()) + x) = acc_scalar;
        }
    },
    input_it, weights_it, biases_it, output_it);
}
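
// Generic float path for depth_multiplier > 1: the loop iterates over input channels, and for each
// filter tap accumulates depth_multiplier partial results, one per generated output channel.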
template <typename T>
void depthwise_loop_generic_fp(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
                               const Size2D &dilation, unsigned int depth_multiplier, const Window &window, bool has_biases)
{
    const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier);

    Window execution_window = window;
    execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1));

    Window win_input = execution_window;
    win_input.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1));
    win_input.set(Window::DimY, dim_manual_loop);
    win_input.set(Window::DimZ, dim_manual_loop);

    Window win_weights = window;
    win_weights.set_dimension_step(Window::DimX, run_info.x_step);
    win_weights.set(Window::DimY, dim_manual_loop);
    win_weights.set(Window::DimZ, dim_manual_loop);
    win_weights.set(Window::DimW, dim_manual_loop);

    Window win_output = window;
    win_output.set_dimension_step(Window::DimX, run_info.x_step);

    Iterator input_it(src, win_input);
    Iterator weights_it(weights, win_weights);
    Iterator output_it(dst, win_output);
    Iterator biases_it{};

    if(has_biases)
    {
        biases_it = Iterator(biases, win_weights);
    }

    execute_window_loop(execution_window, [&](const Coordinates & id)
    {
        std::vector<T> acc(depth_multiplier, static_cast<T>(0));

        const int input_y      = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
        const int input_z      = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
        int       input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;

        auto weights_ptr = weights_it.ptr();
        for(size_t h = 0; h < run_info.weights_height; ++h)
        {
            int offs = input_offset;
            for(size_t w = 0; w < run_info.weights_width; ++w)
            {
                const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
                const auto input_val       = is_valid_region ? *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) : T(0);

                for(size_t m = 0; m < depth_multiplier; ++m)
                {
                    const auto weights_val = *(reinterpret_cast<T *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y));
                    acc.at(m) += input_val * weights_val;
                }

                offs += dilation.x() * run_info.input_stride_y;
            }

            weights_ptr += run_info.weights_stride_z;
            input_offset += dilation.y() * run_info.input_stride_z;
        }

        if(has_biases)
        {
            for(size_t m = 0; m < depth_multiplier; ++m)
            {
                const auto biases_val = *(reinterpret_cast<T *>(biases_it.ptr() + m * sizeof(T)));
                *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m) + biases_val;
            }
        }
        else
        {
            for(size_t m = 0; m < depth_multiplier; ++m)
            {
                *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = acc.at(m);
            }
        }
    },
    input_it, weights_it, biases_it, output_it);
}
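
// Quantized path for depth_multiplier == 1. The accumulation is done on raw quantized values and
// the zero points are corrected afterwards using the identity
//   sum((q_in - z_in) * (q_w - z_w)) = sum(q_in * q_w) - z_w * sum(q_in) - z_in * sum(q_w) + N * z_in * z_w
// where N * z_in * z_w is the precomputed k_offset. The corrected accumulator is then requantized
// with the per-channel multiplier/shift and offset by the output zero point.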
template <typename T, typename TW>
void depthwise_loop_multiplier1_quantized(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
                                          const Size2D &dilation, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases)
{
    constexpr auto element_per_vector = vector_size / sizeof(T);
    using VectorType                  = typename wrapper::traits::neon_vector<T, element_per_vector>::type;
    using TagType                     = typename wrapper::traits::neon_vector<T, element_per_vector>::tag_type;
    using AccType                     = int32_t;
    using AccArrayType                = std::array<AccType, element_per_vector>;

    const auto out_of_bound_value  = PixelValue(static_cast<uint64_t>(0), src->info()->data_type(), src->info()->quantization_info()).get<T>();
    const auto out_of_bound_vector = wrapper::vdup_n(static_cast<T>(out_of_bound_value), TagType{});

    const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window);

    const int32_t input_qoffset   = src->info()->quantization_info().uniform().offset;
    const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset;
    const int32_t output_qoffset  = dst->info()->quantization_info().uniform().offset;
    const int32_t k_offset        = run_info.weights_width * run_info.weights_height * input_qoffset * weights_qoffset;

    Window execution_window = window;
    execution_window.set(Window::DimX, dim_single_unit_step);

    Window win_input = window;
    win_input.set(Window::DimX, dim_manual_loop);
    win_input.set(Window::DimY, dim_manual_loop);
    win_input.set(Window::DimZ, dim_manual_loop);

    Window win_weights = win_input;
    win_weights.set(Window::DimW, dim_manual_loop);

    Window win_output = window;
    win_output.set(Window::DimX, dim_manual_loop);

    Iterator input_it(src, win_input);
    Iterator weights_it(weights, win_weights);
    Iterator output_it(dst, win_output);
    Iterator biases_it{};

    if(has_biases)
    {
        biases_it = Iterator(biases, win_weights);
    }

    execute_window_loop(execution_window, [&](const Coordinates & id)
    {
        const int32_t input_y           = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
        const int32_t input_z           = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
        const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
        auto const    base_weights_ptr  = weights_it.ptr();
        size_t        x                 = run_info.x_start;

        // Vectorized part: accumulate raw products plus the per-lane input and weight sums
        // needed for the zero-point correction.
        for(; x < run_info.x_leftover_start; x += run_info.x_step)
        {
            AccArrayType acc{};
            AccArrayType in_sum{};
            AccArrayType we_sum{};

            auto weights_ptr  = base_weights_ptr;
            auto input_offset = base_input_offset;

            for(size_t h = 0; h < run_info.weights_height; ++h)
            {
                int64_t offs = input_offset + x * sizeof(T);
                for(size_t w = 0; w < run_info.weights_width; ++w)
                {
                    const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
                    const auto input_vals      = is_valid_region ?
                                                 wrapper::vload(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) :
                                                 out_of_bound_vector;
                    const auto weights_vals = wrapper::vload(reinterpret_cast<TW *>(weights_ptr + w * run_info.weights_stride_y) + x);

                    for(size_t i = 0; i < element_per_vector; ++i)
                    {
                        acc.at(i) += input_vals[i] * weights_vals[i];
                        in_sum.at(i) += input_vals[i];
                        we_sum.at(i) += weights_vals[i];
                    }

                    offs += dilation.x() * run_info.input_stride_y;
                }

                weights_ptr += run_info.weights_stride_z;
                input_offset += dilation.y() * run_info.input_stride_z;
            }

            VectorType out_vals = wrapper::vdup_n(static_cast<T>(0), TagType{});
            for(size_t i = 0; i < element_per_vector; ++i)
            {
                acc.at(i) -= in_sum.at(i) * weights_qoffset;
                acc.at(i) -= we_sum.at(i) * input_qoffset;
                acc.at(i) += k_offset;

                if(has_biases)
                {
                    acc.at(i) += *(reinterpret_cast<int32_t *>(biases_it.ptr() + i * sizeof(int32_t)) + x);
                }

                const int32_t out_mul   = output_multiplier.at(x + i);
                const int32_t out_shift = output_shift.at(x + i);
                if(out_shift < 0)
                {
                    acc.at(i) = saturating_doubling_high_mul(acc.at(i) * (1 << (-out_shift)), out_mul) + output_qoffset;
                }
                else
                {
                    acc.at(i) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(i), out_mul), out_shift) + output_qoffset;
                }
                out_vals[i] = static_cast<T>(utility::clamp<AccType, T>(acc.at(i)));
            }

            wrapper::vstore(reinterpret_cast<T *>(output_it.ptr()) + x, out_vals);
        }

        // Scalar tail.
        for(; x < run_info.x_end; ++x)
        {
            AccType acc    = 0;
            AccType in_sum = 0;
            AccType we_sum = 0;

            auto weights_ptr  = base_weights_ptr;
            auto input_offset = base_input_offset;

            for(size_t h = 0; h < run_info.weights_height; ++h)
            {
                int64_t offs = input_offset + x * sizeof(T);
                for(size_t w = 0; w < run_info.weights_width; ++w)
                {
                    const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
                    const auto input_val       = is_valid_region ?
                                                 *reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset)) :
                                                 out_of_bound_value;
                    const auto weights_val = *(reinterpret_cast<TW *>(weights_ptr + w * run_info.weights_stride_y) + x);

                    acc += input_val * weights_val;
                    in_sum += input_val;
                    we_sum += weights_val;

                    offs += dilation.x() * run_info.input_stride_y;
                }

                weights_ptr += run_info.weights_stride_z;
                input_offset += dilation.y() * run_info.input_stride_z;
            }

            T out_vals{ 0 };

            acc -= in_sum * weights_qoffset;
            acc -= we_sum * input_qoffset;
            acc += k_offset;

            if(has_biases)
            {
                acc += *(reinterpret_cast<int32_t *>(biases_it.ptr()) + x);
            }

            const int32_t out_mul   = output_multiplier.at(x);
            const int32_t out_shift = output_shift.at(x);

            if(out_shift < 0)
            {
                acc = saturating_doubling_high_mul(acc * (1 << (-out_shift)), out_mul) + output_qoffset;
            }
            else
            {
                acc = rounding_divide_by_exp2(saturating_doubling_high_mul(acc, out_mul), out_shift) + output_qoffset;
            }

            out_vals = static_cast<T>(utility::clamp<AccType, T>(acc));
            *(reinterpret_cast<T *>(output_it.ptr()) + x) = out_vals;
        }
    },
    input_it, weights_it, biases_it, output_it);
}
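
// Generic quantized path for an arbitrary depth_multiplier: one 32-bit scalar accumulator per
// multiplier, with the same zero-point correction and per-channel requantization as above.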
template <typename T, typename TW>
void depthwise_loop_generic_quantized(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
                                      const Size2D &dilation, unsigned int depth_multiplier, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases)
{
    using AccType = int32_t;

    const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier);

    const auto out_of_bound_value = PixelValue(static_cast<uint64_t>(0), src->info()->data_type(), src->info()->quantization_info()).get<T>();

    const int32_t input_qoffset   = src->info()->quantization_info().uniform().offset;
    const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset;
    const int32_t output_qoffset  = dst->info()->quantization_info().uniform().offset;
    const int32_t k_offset        = run_info.weights_width * run_info.weights_height * input_qoffset * weights_qoffset;

    Window execution_window = window;
    execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1));

    Window win_input = execution_window;
    win_input.set(Window::DimY, dim_manual_loop);
    win_input.set(Window::DimZ, dim_manual_loop);

    Window win_weights = window;
    win_weights.set_dimension_step(Window::DimX, run_info.x_step);
    win_weights.set(Window::DimY, dim_manual_loop);
    win_weights.set(Window::DimZ, dim_manual_loop);
    win_weights.set(Window::DimW, dim_manual_loop);

    Window win_output = window;
    win_output.set_dimension_step(Window::DimX, run_info.x_step);

    Iterator input_it(src, win_input);
    Iterator weights_it(weights, win_weights);
    Iterator output_it(dst, win_output);
    Iterator biases_it{};

    if(has_biases)
    {
        biases_it = Iterator(biases, win_weights);
    }

    execute_window_loop(execution_window, [&](const Coordinates & id)
    {
        std::vector<AccType> acc(depth_multiplier, 0);
        std::vector<AccType> we_sum(depth_multiplier, 0);
        AccType              in_sum = 0;

        const int32_t input_y      = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
        const int32_t input_z      = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
        int64_t       input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;

        auto weights_ptr = weights_it.ptr();
        for(size_t h = 0; h < run_info.weights_height; ++h)
        {
            int offs = input_offset;
            for(size_t w = 0; w < run_info.weights_width; ++w)
            {
                const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
                const auto input_val       = is_valid_region ? *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) : out_of_bound_value;

                for(size_t m = 0; m < depth_multiplier; ++m)
                {
                    const auto weights_val = *(reinterpret_cast<TW *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y));
                    acc.at(m) += input_val * weights_val;

                    we_sum.at(m) += weights_val;
                }
                in_sum += input_val;

                offs += dilation.x() * run_info.input_stride_y;
            }

            weights_ptr += run_info.weights_stride_z;
            input_offset += dilation.y() * run_info.input_stride_z;
        }

        for(size_t m = 0; m < depth_multiplier; ++m)
        {
            acc.at(m) -= in_sum * weights_qoffset;
            acc.at(m) -= we_sum.at(m) * input_qoffset;
            acc.at(m) += k_offset;

            if(has_biases)
            {
                acc.at(m) += *(reinterpret_cast<int32_t *>(biases_it.ptr() + m * sizeof(int32_t)));
            }

            const int32_t out_mul   = output_multiplier.at(id.x() * depth_multiplier + m);
            const int32_t out_shift = output_shift.at(id.x() * depth_multiplier + m);
            if(out_shift < 0)
            {
                acc.at(m) = saturating_doubling_high_mul(acc.at(m) * (1 << (-out_shift)), out_mul) + output_qoffset;
            }
            else
            {
                acc.at(m) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(m), out_mul), out_shift) + output_qoffset;
            }
            *(reinterpret_cast<T *>(output_it.ptr() + m * sizeof(T))) = static_cast<T>(utility::clamp<AccType, T>(acc.at(m)));
        }
    },
    input_it, weights_it, biases_it, output_it);
}
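
// Specialised quantized path used when the weights are quantized per-tensor and depth_multiplier
// is a power of two of at least 8: the multiplier axis is processed in vectors of 8, widening the
// 8-bit values to 16 bits for the multiply-accumulate and keeping two 32-bit accumulator halves,
// with a single requantization multiplier/shift pair for the whole tensor.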
template <typename T, typename TW>
void depthwise_loop_pow2_quantized_per_tensor(const ITensor *src, const ITensor *weights, const ITensor *biases, ITensor *dst, const PadStrideInfo &conv_info,
                                              const Size2D &dilation, unsigned int depth_multiplier, std::vector<int> output_multiplier, std::vector<int> output_shift, const Window &window, bool has_biases)
{
    constexpr int half_vec = vector_size / 2;

    using AccType          = int32_t;
    using AccVectorType    = typename wrapper::traits::neon_vector<AccType, half_vec>::type;
    using AccVectorTagType = typename wrapper::traits::neon_vector<AccType, half_vec>::tag_type;
    using TagType          = typename wrapper::traits::neon_vector<T, vector_size>::tag_type;

    const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(), conv_info, window, depth_multiplier);

    const int32_t input_qoffset   = src->info()->quantization_info().uniform().offset;
    const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset;
    const int32_t output_qoffset  = dst->info()->quantization_info().uniform().offset;

    // Zero points widened to 16-bit (input/weights) and 32-bit (output) vectors.
    const auto input_qoffset_vec   = wrapper::vreinterpret(wrapper::vmovl(wrapper::vdup_n(static_cast<T>(input_qoffset), TagType{})));
    const auto weights_qoffset_vec = wrapper::vreinterpret(wrapper::vmovl(wrapper::vdup_n(static_cast<TW>(weights_qoffset), TagType{})));
    const auto output_qoffset_vec  = wrapper::vdup_n(output_qoffset, wrapper::traits::vector_128_tag{});

    const auto upper = wrapper::vdup_n(static_cast<AccType>(std::numeric_limits<T>::max()), AccVectorTagType{});
    const auto zero  = wrapper::vdup_n(static_cast<AccType>(0), AccVectorTagType{});

    // Per-tensor quantization: one multiplier/shift pair for every output channel.
    const auto out_mul   = output_multiplier.at(0);
    const auto out_shift = output_shift.at(0);

    Window execution_window = window;
    execution_window.set(Window::DimX, Window::Dimension(0, run_info.input_depth, 1));

    Window win_input = execution_window;
    win_input.set(Window::DimY, dim_manual_loop);
    win_input.set(Window::DimZ, dim_manual_loop);

    Window win_weights = window;
    win_weights.set_dimension_step(Window::DimX, run_info.x_step);
    win_weights.set(Window::DimY, dim_manual_loop);
    win_weights.set(Window::DimZ, dim_manual_loop);
    win_weights.set(Window::DimW, dim_manual_loop);

    Window win_output = window;
    win_output.set_dimension_step(Window::DimX, run_info.x_step);

    Iterator input_it(src, win_input);
    Iterator weights_it(weights, win_weights);
    Iterator output_it(dst, win_output);
    Iterator biases_it{};

    if(has_biases)
    {
        biases_it = Iterator(biases, win_weights);
    }

    std::vector<AccVectorType> acc0(depth_multiplier / vector_size);
    std::vector<AccVectorType> acc1(depth_multiplier / vector_size);

    execute_window_loop(execution_window, [&](const Coordinates & id)
    {
        std::fill(begin(acc0), end(acc0), zero);
        std::fill(begin(acc1), end(acc1), zero);

        const int32_t input_y      = id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
        const int32_t input_z      = id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
        int64_t       input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;

        auto weights_ptr = weights_it.ptr();
        for(size_t h = 0; h < run_info.weights_height; ++h)
        {
            const int32_t current_h = input_z + h * dilation.y();
            if(current_h >= 0 && current_h < static_cast<int32_t>(run_info.input_height))
            {
                int offs = input_offset;
                for(size_t w = 0; w < run_info.weights_width; ++w)
                {
                    const int32_t current_w = input_y + w * dilation.x();
                    if(current_w >= 0 && current_w < static_cast<int32_t>(run_info.input_width))
                    {
                        const auto input_8x8     = wrapper::vdup_n(*(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))), TagType{});
                        const auto input_s16x8   = wrapper::vreinterpret(wrapper::vmovl(input_8x8));
                        const auto input_no_offs = wrapper::vsub(input_s16x8, input_qoffset_vec);

                        for(size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i)
                        {
                            const auto weights_8x8     = wrapper::vload(reinterpret_cast<TW *>(weights_ptr + m * sizeof(T) + w * run_info.weights_stride_y));
                            const auto weights_s16x8   = wrapper::vreinterpret(wrapper::vmovl(weights_8x8));
                            const auto weights_no_offs = wrapper::vsub(weights_s16x8, weights_qoffset_vec);

                            // Widening multiply-accumulate: low and high halves go to separate 32-bit accumulators.
                            acc0.at(i) = wrapper::vmlal(acc0.at(i), wrapper::vgetlow(input_no_offs), wrapper::vgetlow(weights_no_offs));
                            acc1.at(i) = wrapper::vmlal(acc1.at(i), wrapper::vgethigh(input_no_offs), wrapper::vgethigh(weights_no_offs));
                        }
                    }

                    offs += dilation.x() * run_info.input_stride_y;
                }
            }

            weights_ptr += run_info.weights_stride_z;
            input_offset += dilation.y() * run_info.input_stride_z;
        }

        for(size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i)
        {
            if(has_biases)
            {
                const auto bias_val0 = wrapper::vloadq(reinterpret_cast<int32_t *>(biases_it.ptr() + m * sizeof(int32_t)));
                const auto bias_val1 = wrapper::vloadq(reinterpret_cast<int32_t *>(biases_it.ptr() + (m + half_vec) * sizeof(int32_t)));

                acc0.at(i) = wrapper::vadd(acc0.at(i), bias_val0);
                acc1.at(i) = wrapper::vadd(acc1.at(i), bias_val1);
            }

            if(out_shift < 0)
            {
                acc0.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc0.at(i) * (1 << (-out_shift)), out_mul), output_qoffset_vec);
                acc1.at(i) = wrapper::vadd(saturating_doubling_high_mul(acc1.at(i) * (1 << (-out_shift)), out_mul), output_qoffset_vec);
            }
            else
            {
                acc0.at(i) = wrapper::vadd(rounding_divide_by_exp2(saturating_doubling_high_mul(acc0.at(i), out_mul), out_shift), output_qoffset_vec);
                acc1.at(i) = wrapper::vadd(rounding_divide_by_exp2(saturating_doubling_high_mul(acc1.at(i), out_mul), out_shift), output_qoffset_vec);
            }

            if(std::is_same<T, uint8_t>::value)
            {
                acc0.at(i) = wrapper::vmax(acc0.at(i), zero);
                acc1.at(i) = wrapper::vmax(acc1.at(i), zero);
            }

            acc0.at(i) = wrapper::vmin(acc0.at(i), upper);
            acc1.at(i) = wrapper::vmin(acc1.at(i), upper);

            // The clamped accumulators are then narrowed back to the 8-bit output type and stored at output_it.ptr() + m.
        }
    },
    input_it, weights_it, biases_it, output_it);
}
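
// Entry points called from the kernel: they unpack ConvolutionInfo, pick the specialised loop that
// matches the depth multiplier (and, for quantized data, the quantization layout), and for the
// 8-bit path precompute one requantization multiplier/shift pair per output channel.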
template <typename T, typename TW>
void run_depthwise_float(const ITensor *src, const ITensor *weights, const ITensor *biases,
                         ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info)
{
    PadStrideInfo conv_info        = info.pad_stride_info;
    unsigned int  depth_multiplier = info.depth_multiplier;
    Size2D        dilation         = info.dilation;

    if(depth_multiplier == 1)
    {
        depthwise_loop_multiplier1_fp<T>(src, weights, biases, dst, conv_info, dilation, window, has_biases);
    }
    else
    {
        depthwise_loop_generic_fp<T>(src, weights, biases, dst, conv_info, dilation, depth_multiplier, window, has_biases);
    }
}

template void run_depthwise_float<float, float>(const ITensor *src, const ITensor *weights, const ITensor *biases,
                                                ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info);
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
template void run_depthwise_float<float16_t, float16_t>(const ITensor *src, const ITensor *weights, const ITensor *biases,
                                                         ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info);
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

template <typename T, typename TW>
void run_depthwise_quanitized8bit(const ITensor *src, const ITensor *weights, const ITensor *biases,
                                  ITensor *dst, const Window &window, bool has_biases, const ConvolutionInfo &info)
{
    PadStrideInfo conv_info        = info.pad_stride_info;
    unsigned int  depth_multiplier = info.depth_multiplier;
    Size2D        dilation         = info.dilation;
    std::vector<int> output_multiplier;
    std::vector<int> output_shift;

    const auto input_scale   = src->info()->quantization_info().uniform().scale;
    const auto output_scale  = dst->info()->quantization_info().uniform().scale;
    auto       weights_scale = weights->info()->quantization_info().scale();

    // Per-tensor quantized weights: replicate the single scale so that there is one
    // multiplier/shift pair per output channel.
    if(!is_data_type_quantized_per_channel(weights->info()->data_type()))
    {
        for(size_t i = 1; i < weights->info()->dimension(channel_idx); ++i)
        {
            weights_scale.push_back(weights_scale.front());
        }
    }

    for(const auto &s : weights_scale)
    {
        int32_t     out_mult   = 0;
        int32_t     out_shift  = 0;
        const float multiplier = input_scale * s / output_scale;
        quantization::calculate_quantized_multiplier(multiplier, &out_mult, &out_shift);

        output_multiplier.push_back(out_mult);
        output_shift.push_back(out_shift);
    }

    if(depth_multiplier == 1)
    {
        depthwise_loop_multiplier1_quantized<T, TW>(src, weights, biases, dst, conv_info, dilation, output_multiplier, output_shift, window, has_biases);
    }
    else
    {
        const bool is_pow2                 = ((depth_multiplier & (depth_multiplier - 1)) == 0);
        const bool is_quantized_per_tensor = !(is_data_type_quantized_per_channel(weights->info()->data_type()));

        if(is_pow2 && is_quantized_per_tensor && depth_multiplier >= 8)
        {
            depthwise_loop_pow2_quantized_per_tensor<T, TW>(src, weights, biases, dst, conv_info, dilation, depth_multiplier, output_multiplier, output_shift, window, has_biases);
        }
        else
        {
            depthwise_loop_generic_quantized<T, TW>(src, weights, biases, dst, conv_info, dilation, depth_multiplier, output_multiplier, output_shift, window, has_biases);
        }
    }
}