50 constexpr
auto dim_manual_loop = Window::Dimension(0, 0, 0);
51 constexpr
auto dim_single_unit_step = Window::Dimension(0, 1, 1);
52 constexpr
size_t vector_size = 8;
// Bundles every per-run parameter (strides in bytes, pads, tensor geometry) read
// once from the tensor infos so the hot convolution loops below consume plain
// scalars instead of repeated virtual ITensorInfo accessor calls.
// NOTE(review): this extract is incomplete -- the member declarations (x_start,
// x_step, x_end, num_read_elements_per_iteration, ...) and several source lines
// are not visible here; verify against the full file before editing.
54 struct DepthwiseConvolutionRunInfo
// Derives the run parameters from the input/weights infos, the pad/stride info
// and the execution window; depth_multiplier scales the vectorized X step.
76 DepthwiseConvolutionRunInfo(
const ITensorInfo &
input,
const ITensorInfo &weights,
const PadStrideInfo &
conv_info,
const Window &
w, uint32_t depth_multiplier = 1)
// X range [x_start, x_leftover_start) is covered by the vector loop in steps of
// x_step; the remainder is handled element-wise by the scalar tail loop.
78 x_start(w.x().start()),
80 x_step(static_cast<uint32_t>(num_read_elements_per_iteration * depth_multiplier)),
81 x_leftover_start(
std::max(static_cast<int32_t>(w.x().
end()) - static_cast<int32_t>(x_step) + 1, int32_t(0))),
// Byte strides along Y/Z; input_max_offset is the clamp applied to speculative
// reads so out-of-range taps stay inside the tensor's padded allocation.
82 input_stride_y(input.strides_in_bytes().y()),
83 input_stride_z(input.strides_in_bytes().z()),
84 input_max_offset(input.strides_in_bytes().z() * input.dimension(height_idx) - (input.padding().bottom + input.padding().top) * input.strides_in_bytes().y()),
85 weights_width(weights.dimension(width_idx)),
86 weights_height(weights.dimension(height_idx)),
87 weights_stride_y(weights.strides_in_bytes().y()),
88 weights_stride_z(weights.strides_in_bytes().z()),
// Convolution strides/pads in elements, from the PadStrideInfo.
89 conv_stride_x(conv_info.stride().first),
90 conv_stride_y(conv_info.stride().second),
91 conv_pad_left(conv_info.pad_left()),
92 conv_pad_top(conv_info.pad_top()),
// Input geometry resolved through the data-layout dimension indices.
93 input_height(input.dimension(height_idx)),
94 input_width(input.dimension(width_idx)),
95 input_depth(input.dimension(channel_idx))
// Saturating rounding doubling high multiply (maps to SQRDMULH): returns the
// high 32 bits of 2*a*b with rounding, saturating the INT32_MIN * INT32_MIN
// corner case to INT32_MAX. Vector-by-scalar overload.
inline int32x4_t saturating_doubling_high_mul(const int32x4_t &a, const int32_t &b)
{
    return vqrdmulhq_n_s32(a, b);
}
// Scalar overload of the saturating rounding doubling high multiply: broadcasts
// `a` into a 2-lane vector, applies VQRDMULH, and extracts lane 0.
inline int32_t saturating_doubling_high_mul(const int32_t &a, const int32_t &b)
{
    const int32x2_t a_vec = vdup_n_s32(a);
    return vget_lane_s32(vqrdmulh_n_s32(a_vec, b), 0);
}
// Arithmetic right shift by `exponent` with round-to-nearest, ties away from
// zero (gemmlowp's RoundingDivideByPOT). The fixup term nudges negative inputs
// before the rounding shift so both signs round symmetrically. 4-lane overload.
inline int32x4_t rounding_divide_by_exp2(const int32x4_t &x, const int exponent)
{
    const int32x4_t shift_vec  = vdupq_n_s32(-exponent);
    const int32x4_t fixup      = vshrq_n_s32(vandq_s32(x, shift_vec), 31);
    const int32x4_t fixed_up_x = vqaddq_s32(x, fixup);
    return vrshlq_s32(fixed_up_x, shift_vec);
}
// 2-lane overload of the rounding power-of-two divide (see the int32x4_t
// variant above for the rounding rationale).
inline int32x2_t rounding_divide_by_exp2(const int32x2_t &x, const int exponent)
{
    const int32x2_t shift_vec  = vdup_n_s32(-exponent);
    const int32x2_t fixup      = vshr_n_s32(vand_s32(x, shift_vec), 31);
    const int32x2_t fixed_up_x = vqadd_s32(x, fixup);
    return vrshl_s32(fixed_up_x, shift_vec);
}
// Scalar overload: reuses the 2-lane vector implementation on a broadcast of x
// and returns lane 0.
inline int32_t rounding_divide_by_exp2(const int32_t &x, const int exponent)
{
    const int32x2_t x_vec = vdup_n_s32(x);
    return vget_lane_s32(rounding_divide_by_exp2(x_vec, exponent), 0);
}
132 inline bool is_valid_input_region(int32_t base_w, uint32_t base_h, uint32_t
w, uint32_t h,
const DepthwiseConvolutionRunInfo &run_info,
const Size2D &dilation)
134 const int32_t current_h = base_h + h * dilation.y();
135 const bool is_valid_h = current_h >= 0 && current_h < static_cast<int32_t>(run_info.input_height);
137 const int32_t current_w = base_w + w * dilation.x();
138 const bool is_valid_w = current_w >= 0 && current_w < static_cast<int32_t>(run_info.input_width);
140 return is_valid_h && is_valid_w;
// Depthwise convolution for float types with depth_multiplier == 1 (NHWC):
// a vectorized loop over channels in steps of x_step followed by a scalar tail
// loop; out-of-range (padded) taps contribute zero.
// NOTE(review): extract is incomplete -- the VectorType alias, the win_input /
// win_weights / win_output dimension adjustments (orig. lines 157-168), the
// execute_window_loop lambda header, the vector multiply-accumulate and the
// vector store are missing here; verify against the full file.
143 template <
typename T>
144 void depthwise_loop_multiplier1_fp(
const ITensor *
src,
const ITensor *weights,
const ITensor *biases, ITensor *
dst,
const PadStrideInfo &
conv_info,
145 const Size2D &dilation,
const Window &window,
bool has_biases)
// Lanes per vector register for this element type.
147 constexpr
auto element_per_vector = vector_size /
sizeof(T);
149 using TagType =
typename wrapper::traits::neon_vector<T, element_per_vector>::tag_type;
151 const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(),
conv_info, window);
153 const VectorType zero_vector =
wrapper::vdup_n(static_cast<T>(0), TagType{});
// X is advanced manually inside the lambda, so the execution window steps it
// one unit at a time.
155 Window execution_window = window;
156 execution_window.set(
Window::DimX, dim_single_unit_step);
158 Window win_input = window;
163 Window win_weights = win_input;
166 Window win_output = window;
169 Iterator input_it(src, win_input);
170 Iterator weights_it(weights, win_weights);
171 Iterator output_it(dst, win_output);
172 Iterator biases_it{};
176 biases_it = Iterator(biases, win_weights);
// Top-left input coordinate for this output position; can be negative due to
// the convolution padding.
181 const int32_t input_y =
id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
182 const int32_t input_z =
id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
183 const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
185 auto const base_weights_ptr = weights_it.ptr();
186 uint32_t x = run_info.x_start;
// Vectorized main loop over channels.
188 for(; x < run_info.x_leftover_start; x += run_info.x_step)
190 VectorType acc = zero_vector;
191 auto weights_ptr = base_weights_ptr;
192 int64_t input_offset = base_input_offset;
194 for(uint32_t h = 0; h < run_info.weights_height; ++h)
196 int64_t offs = input_offset + x *
sizeof(T);
197 for(uint32_t w = 0; w < run_info.weights_width; ++
w)
// Clamp the read offset so a padded (invalid) tap never dereferences past
// the allocated buffer; invalid taps load the zero vector instead.
199 const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
200 const auto input_vals = is_valid_region ?
201 wrapper::vload(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) :
203 const auto weights_vals =
wrapper::vload(reinterpret_cast<T *>(weights_ptr + w * run_info.weights_stride_y) + x);
206 offs += dilation.x() * run_info.input_stride_y;
209 weights_ptr += run_info.weights_stride_z;
210 input_offset += dilation.y() * run_info.input_stride_z;
215 const auto biases_vals =
wrapper::vload(reinterpret_cast<T *>(biases_it.ptr()) + x);
// Scalar tail loop for channels that do not fill a whole vector.
222 for(; x < run_info.x_end; ++x)
224 auto acc_scalar = T{ 0 };
225 auto weights_ptr = base_weights_ptr;
226 int64_t input_offset = base_input_offset;
228 for(
size_t h = 0; h < run_info.weights_height; ++h)
230 int64_t offs = input_offset + x *
sizeof(T);
231 for(
size_t w = 0; w < run_info.weights_width; ++
w)
233 const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
234 const auto input_vals = is_valid_region ? *
reinterpret_cast<T *
>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset)) : 0;
235 const auto weights_vals = *(
reinterpret_cast<T *
>(weights_ptr + w * run_info.weights_stride_y) + x);
237 acc_scalar += (input_vals * weights_vals);
239 offs += dilation.x() * run_info.input_stride_y;
242 weights_ptr += run_info.weights_stride_z;
243 input_offset += dilation.y() * run_info.input_stride_z;
248 const auto biases_vals = *(
reinterpret_cast<T *
>(biases_it.ptr()) + x);
249 acc_scalar += biases_vals;
251 *(
reinterpret_cast<T *
>(output_it.ptr()) + x) = acc_scalar;
254 input_it, weights_it, biases_it, output_it);
// Depthwise convolution for float types with an arbitrary depth_multiplier:
// each input channel produces depth_multiplier output channels, accumulated in
// a small per-iteration vector.
// NOTE(review): extract is incomplete -- the execute_window_loop lambda header,
// the multiply-accumulate statement for acc.at(m), and the has_biases branch
// structure are missing here; verify against the full file.
257 template <
typename T>
258 void depthwise_loop_generic_fp(
const ITensor *src,
const ITensor *weights,
const ITensor *biases, ITensor *dst,
const PadStrideInfo &conv_info,
259 const Size2D &dilation,
unsigned int depth_multiplier,
const Window &window,
bool has_biases)
261 const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(),
conv_info, window, depth_multiplier);
// Iterate the input one channel at a time; each iteration writes
// depth_multiplier consecutive outputs.
263 Window execution_window = window;
264 execution_window.set(
Window::DimX, Window::Dimension(0, run_info.input_depth, 1));
266 Window win_input = execution_window;
267 win_input.set(
Window::DimX, Window::Dimension(0, run_info.input_depth, 1));
// Weights and output step x_step (= lanes * depth_multiplier) per iteration.
271 Window win_weights = window;
272 win_weights.set_dimension_step(
Window::DimX, run_info.x_step);
277 Window win_output = window;
278 win_output.set_dimension_step(
Window::DimX, run_info.x_step);
280 Iterator input_it(src, win_input);
281 Iterator weights_it(weights, win_weights);
282 Iterator output_it(dst, win_output);
283 Iterator biases_it{};
287 biases_it = Iterator(biases, win_weights);
// One accumulator per output channel produced by this input channel.
292 std::vector<T> acc(depth_multiplier, static_cast<T>(0));
294 const int input_y =
id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
295 const int input_z =
id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
296 int input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
298 auto weights_ptr = weights_it.ptr();
299 for(
size_t h = 0; h < run_info.weights_height; ++h)
301 int offs = input_offset;
302 for(
size_t w = 0; w < run_info.weights_width; ++
w)
// Padded taps read T(0); valid reads are clamped to input_max_offset.
304 const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
305 const auto input_val = is_valid_region ? *(
reinterpret_cast<T *
>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) : T(0);
307 for(
size_t m = 0; m < depth_multiplier; ++m)
309 const auto weights_val = *(
reinterpret_cast<T *
>(weights_ptr + m *
sizeof(T) + w * run_info.weights_stride_y));
313 offs += dilation.x() * run_info.input_stride_y;
316 weights_ptr += run_info.weights_stride_z;
317 input_offset += dilation.y() * run_info.input_stride_z;
// Store: with biases each output adds its bias; otherwise the raw accumulator.
322 for(
size_t m = 0; m < depth_multiplier; ++m)
324 const auto biases_val = *(
reinterpret_cast<T *
>(biases_it.ptr() + m *
sizeof(T)));
325 *(
reinterpret_cast<T *
>(output_it.ptr() + m *
sizeof(T))) = acc.at(m) + biases_val;
330 for(
size_t m = 0; m < depth_multiplier; ++m)
332 *(
reinterpret_cast<T *
>(output_it.ptr() + m *
sizeof(T))) = acc.at(m);
336 input_it, weights_it, biases_it, output_it);
// Quantized (QASYMM8 / QASYMM8_SIGNED) depthwise convolution, depth_multiplier
// == 1. Accumulates raw int32 products plus running input/weight sums so the
// zero-point corrections (-in_sum*w_offset - we_sum*in_offset + k_offset) can
// be applied after the kernel loop, then requantizes each channel with its
// per-channel multiplier/shift and clamps to the output type range.
// NOTE(review): extract is incomplete -- the acc declaration in the vector
// loop, the window adjustments, the execute_window_loop lambda header, the
// out_vals declarations/stores and the shift-sign branches are missing here.
339 template <
typename T,
typename TW>
340 void depthwise_loop_multiplier1_quantized(
const ITensor *src,
const ITensor *weights,
const ITensor *biases, ITensor *dst,
const PadStrideInfo &conv_info,
341 const Size2D &dilation, std::vector<int> output_multiplier, std::vector<int> output_shift,
const Window &window,
bool has_biases)
344 constexpr
auto element_per_vector = vector_size /
sizeof(T);
346 using TagType =
typename wrapper::traits::neon_vector<T, element_per_vector>::tag_type;
347 using AccType = int32_t;
348 using AccArrayType = std::array<AccType, element_per_vector>;
// Value substituted for padded taps: the quantized representation of 0.
350 const auto out_of_bound_value = PixelValue(static_cast<uint64_t>(0), src->info()->data_type(), src->info()->quantization_info()).get<T>();
351 const auto out_of_bound_vector =
wrapper::vdup_n(static_cast<T>(out_of_bound_value), TagType{});
353 const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(),
conv_info, window);
// Uniform zero points; k_offset folds the constant taps*in_offset*w_offset term.
355 const int32_t input_qoffset = src->info()->quantization_info().uniform().offset;
356 const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset;
357 const int32_t output_qoffset = dst->info()->quantization_info().uniform().offset;
358 const int32_t k_offset = run_info.weights_width * run_info.weights_height * input_qoffset * weights_qoffset;
360 Window execution_window = window;
361 execution_window.set(
Window::DimX, dim_single_unit_step);
363 Window win_input = window;
368 Window win_weights = win_input;
371 Window win_output = window;
374 Iterator input_it(src, win_input);
375 Iterator weights_it(weights, win_weights);
376 Iterator output_it(dst, win_output);
377 Iterator biases_it{};
381 biases_it = Iterator(biases, win_weights);
386 const int32_t input_y =
id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
387 const int32_t input_z =
id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
388 const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
389 auto const base_weights_ptr = weights_it.ptr();
390 size_t x = run_info.x_start;
// Vectorized main loop over channels.
392 for(; x < run_info.x_leftover_start; x += run_info.x_step)
395 AccArrayType in_sum{};
396 AccArrayType we_sum{};
398 auto weights_ptr = base_weights_ptr;
399 auto input_offset = base_input_offset;
401 for(
size_t h = 0; h < run_info.weights_height; ++h)
403 int64_t offs = input_offset + x *
sizeof(T);
404 for(
size_t w = 0; w < run_info.weights_width; ++
w)
406 const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
407 const auto input_vals = is_valid_region ?
408 wrapper::vload(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) :
410 const auto weights_vals =
wrapper::vload(reinterpret_cast<TW *>(weights_ptr + w * run_info.weights_stride_y) + x);
// Lane-wise accumulate of products and of the raw input/weight sums used
// for the zero-point correction after the kernel loop.
412 for(
size_t i = 0; i < element_per_vector; ++i)
414 acc.at(i) += input_vals[i] * weights_vals[i];
415 in_sum.at(i) += input_vals[i];
416 we_sum.at(i) += weights_vals[i];
419 offs += dilation.x() * run_info.input_stride_y;
422 weights_ptr += run_info.weights_stride_z;
423 input_offset += dilation.y() * run_info.input_stride_z;
// Zero-point correction, bias add and per-channel requantization.
427 for(
size_t i = 0; i < element_per_vector; ++i)
429 acc.at(i) -= in_sum.at(i) * weights_qoffset;
430 acc.at(i) -= we_sum.at(i) * input_qoffset;
431 acc.at(i) += k_offset;
435 acc.at(i) += *(
reinterpret_cast<int32_t *
>(biases_it.ptr() + i *
sizeof(int32_t)) + x);
438 const int32_t out_mul = output_multiplier.at(x + i);
439 const int32_t out_shift = output_shift.at(x + i);
// Negative shift => multiply up before the fixed-point multiply; otherwise
// rounding-divide by 2^shift after it.
442 acc.at(i) = saturating_doubling_high_mul(acc.at(i) * (1 << (-out_shift)), out_mul) + output_qoffset;
446 acc.at(i) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(i), out_mul), out_shift) + output_qoffset;
448 out_vals[i] =
static_cast<T
>(utility::clamp<AccType, T>(acc.at(i)));
// Scalar tail loop, same algorithm one channel at a time.
455 for(; x < run_info.x_end; ++x)
461 auto weights_ptr = base_weights_ptr;
462 auto input_offset = base_input_offset;
464 for(
size_t h = 0; h < run_info.weights_height; ++h)
466 int64_t offs = input_offset + x *
sizeof(T);
467 for(
size_t w = 0; w < run_info.weights_width; ++
w)
469 const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
470 const auto input_val = is_valid_region ?
471 *
reinterpret_cast<T *
>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset)) :
473 const auto weights_val = *(
reinterpret_cast<TW *
>(weights_ptr + w * run_info.weights_stride_y) + x);
475 acc += input_val * weights_val;
477 we_sum += weights_val;
479 offs += dilation.x() * run_info.input_stride_y;
482 weights_ptr += run_info.weights_stride_z;
483 input_offset += dilation.y() * run_info.input_stride_z;
488 acc -= in_sum * weights_qoffset;
489 acc -= we_sum * input_qoffset;
494 acc += *(
reinterpret_cast<int32_t *
>(biases_it.ptr()) + x);
497 const int32_t out_mul = output_multiplier.at(x);
498 const int32_t out_shift = output_shift.at(x);
502 acc = saturating_doubling_high_mul(acc * (1 << (-out_shift)), out_mul) + output_qoffset;
506 acc = rounding_divide_by_exp2(saturating_doubling_high_mul(acc, out_mul), out_shift) + output_qoffset;
509 out_vals =
static_cast<T
>(utility::clamp<AccType, T>(acc));
510 *(
reinterpret_cast<T *
>(output_it.ptr()) + x) = out_vals;
513 input_it, weights_it, biases_it, output_it);
// Quantized depthwise convolution for an arbitrary depth_multiplier: each
// input channel produces depth_multiplier quantized outputs, each requantized
// with its own multiplier/shift (indexed id.x()*depth_multiplier + m).
// NOTE(review): extract is incomplete -- the in_sum accumulator declaration,
// window adjustments, the execute_window_loop lambda header and the has_biases
// branches are missing here; verify against the full file.
516 template <
typename T,
typename TW>
517 void depthwise_loop_generic_quantized(
const ITensor *src,
const ITensor *weights,
const ITensor *biases, ITensor *dst,
const PadStrideInfo &conv_info,
518 const Size2D &dilation,
unsigned int depth_multiplier, std::vector<int> output_multiplier, std::vector<int> output_shift,
const Window &window,
bool has_biases)
520 using AccType = int32_t;
522 const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(),
conv_info, window, depth_multiplier);
// Padded taps read the quantized representation of 0.
524 const auto out_of_bound_value = PixelValue(static_cast<uint64_t>(0), src->info()->data_type(), src->info()->quantization_info()).get<T>();
526 const int32_t input_qoffset = src->info()->quantization_info().uniform().offset;
527 const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset;
528 const int32_t output_qoffset = dst->info()->quantization_info().uniform().offset;
529 const int32_t k_offset = run_info.weights_width * run_info.weights_height * input_qoffset * weights_qoffset;
531 Window execution_window = window;
532 execution_window.set(
Window::DimX, Window::Dimension(0, run_info.input_depth, 1));
534 Window win_input = execution_window;
538 Window win_weights = window;
539 win_weights.set_dimension_step(
Window::DimX, run_info.x_step);
544 Window win_output = window;
545 win_output.set_dimension_step(
Window::DimX, run_info.x_step);
547 Iterator input_it(src, win_input);
548 Iterator weights_it(weights, win_weights);
549 Iterator output_it(dst, win_output);
550 Iterator biases_it{};
554 biases_it = Iterator(biases, win_weights);
// Per-output-channel accumulators and weight sums for this input channel.
559 std::vector<AccType> acc(depth_multiplier, 0);
560 std::vector<AccType> we_sum(depth_multiplier, 0);
563 const int32_t input_y =
id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
564 const int32_t input_z =
id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
565 int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
567 auto weights_ptr = weights_it.ptr();
568 for(
size_t h = 0; h < run_info.weights_height; ++h)
570 int offs = input_offset;
571 for(
size_t w = 0; w < run_info.weights_width; ++
w)
573 const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
574 const auto input_val = is_valid_region ? *(
reinterpret_cast<T *
>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) : out_of_bound_value;
576 for(
size_t m = 0; m < depth_multiplier; ++m)
// NOTE(review): the weight offset uses sizeof(T) rather than sizeof(TW);
// harmless while both are 1-byte quantized types, but worth confirming.
578 const auto weights_val = *(
reinterpret_cast<TW *
>(weights_ptr + m *
sizeof(T) + w * run_info.weights_stride_y));
579 acc.at(m) += input_val * weights_val;
581 we_sum.at(m) += weights_val;
584 offs += dilation.x() * run_info.input_stride_y;
588 weights_ptr += run_info.weights_stride_z;
589 input_offset += dilation.y() * run_info.input_stride_z;
// Zero-point correction, optional bias, per-channel requantize, clamp, store.
592 for(
size_t m = 0; m < depth_multiplier; ++m)
594 acc.at(m) -= in_sum * weights_qoffset;
595 acc.at(m) -= we_sum.at(m) * input_qoffset;
596 acc.at(m) += k_offset;
600 acc.at(m) += *(
reinterpret_cast<int32_t *
>(biases_it.ptr() + m *
sizeof(int32_t)));
603 const int32_t out_mul = output_multiplier.at(
id.x() * depth_multiplier + m);
604 const int32_t out_shift = output_shift.at(
id.x() * depth_multiplier + m);
607 acc.at(m) = saturating_doubling_high_mul(acc.at(m) * (1 << (-out_shift)), out_mul) + output_qoffset;
611 acc.at(m) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(m), out_mul), out_shift) + output_qoffset;
613 *(
reinterpret_cast<T *
>(output_it.ptr() + m *
sizeof(T))) = static_cast<T>(utility::clamp<AccType, T>(acc.at(m)));
616 input_it, weights_it, biases_it, output_it);
// Fast path for per-tensor-quantized depthwise convolution when the depth
// multiplier is a power of two >= 8: broadcasts each input value across an
// 8-lane vector and processes vector_size output channels at a time, splitting
// each accumulator into two int32x4 halves (acc0/acc1). A single per-tensor
// multiplier/shift (output_multiplier.at(0) / output_shift.at(0)) requantizes
// all channels.
// NOTE(review): extract is incomplete -- the s16 widening of input/weights,
// the vmlal accumulation, the vqaddq bias add, the narrowing/clamp/store and
// several window adjustments are missing here; verify against the full file.
619 template <
typename T,
typename TW>
620 void depthwise_loop_pow2_quantized_per_tensor(
const ITensor *src,
const ITensor *weights,
const ITensor *biases, ITensor *dst,
const PadStrideInfo &conv_info,
621 const Size2D &dilation,
unsigned int depth_multiplier, std::vector<int> output_multiplier, std::vector<int> output_shift,
const Window &window,
bool has_biases)
623 constexpr
int half_vec = vector_size / 2;
625 using AccType = int32_t;
627 using AccVectorTagType =
typename wrapper::traits::neon_vector<AccType, half_vec>::tag_type;
628 using TagType =
typename wrapper::traits::neon_vector<T, vector_size>::tag_type;
630 const auto run_info = DepthwiseConvolutionRunInfo(*src->info(), *weights->info(),
conv_info, window, depth_multiplier);
// Clamp bounds for the output type, in the int32 accumulator domain.
637 const auto upper =
wrapper::vdup_n(static_cast<AccType>(std::numeric_limits<T>::max()), AccVectorTagType{});
638 const auto zero =
wrapper::vdup_n(static_cast<AccType>(0), AccVectorTagType{});
// Per-tensor quantization: a single multiplier/shift for all channels.
640 const auto out_mul = output_multiplier.at(0);
641 const auto out_shift = output_shift.at(0);
643 Window execution_window = window;
644 execution_window.set(
Window::DimX, Window::Dimension(0, run_info.input_depth, 1));
646 Window win_input = execution_window;
650 Window win_weights = window;
651 win_weights.set_dimension_step(
Window::DimX, run_info.x_step);
656 Window win_output = window;
657 win_output.set_dimension_step(
Window::DimX, run_info.x_step);
659 Iterator input_it(src, win_input);
660 Iterator weights_it(weights, win_weights);
661 Iterator output_it(dst, win_output);
662 Iterator biases_it{};
666 biases_it = Iterator(biases, win_weights);
// Two int32x4 accumulator halves per vector_size output channels.
669 std::vector<AccVectorType> acc0(depth_multiplier / vector_size);
670 std::vector<AccVectorType> acc1(depth_multiplier / vector_size);
677 const int32_t input_y =
id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
678 const int32_t input_z =
id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
679 int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
681 auto weights_ptr = weights_it.ptr();
682 for(
size_t h = 0; h < run_info.weights_height; ++h)
// Bounds checks are hoisted out of the channel loop here (contrast with
// is_valid_input_region in the generic paths).
684 const int32_t current_h = input_z + h * dilation.y();
685 if(current_h >= 0 && current_h < static_cast<int32_t>(run_info.input_height))
687 int offs = input_offset;
688 for(
size_t w = 0; w < run_info.weights_width; ++
w)
690 const int32_t current_w = input_y + w * dilation.x();
691 if(current_w >= 0 && current_w < static_cast<int32_t>(run_info.input_width))
// Broadcast the single input value across all lanes, then remove the
// input zero point in the widened domain.
693 const auto input_8x8 =
wrapper::vdup_n(*(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))), TagType{});
695 const auto input_no_offs =
wrapper::vsub(input_s16x8, input_qoffset_vec);
697 for(
size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i)
699 const auto weights_8x8 =
wrapper::vload(reinterpret_cast<TW *>(weights_ptr + m *
sizeof(T) + w * run_info.weights_stride_y));
701 const auto weights_no_offs =
wrapper::vsub(weights_s16x8, weights_qoffset_vec);
708 offs += dilation.x() * run_info.input_stride_y;
712 weights_ptr += run_info.weights_stride_z;
713 input_offset += dilation.y() * run_info.input_stride_z;
// Bias add (both halves) and per-tensor requantization.
716 for(
size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i)
720 const auto bias_val0 =
wrapper::vloadq(reinterpret_cast<int32_t *>(biases_it.ptr() + m *
sizeof(int32_t)));
721 const auto bias_val1 =
wrapper::vloadq(reinterpret_cast<int32_t *>(biases_it.ptr() + (m + half_vec) *
sizeof(int32_t)));
// Negative shift => pre-multiply by 2^-shift; otherwise rounding divide
// after the fixed-point multiply (same scheme as the scalar paths).
729 acc0.at(i) =
wrapper::vadd(saturating_doubling_high_mul(acc0.at(i) * (1 << (-out_shift)), out_mul), output_qoffset_vec);
730 acc1.at(i) =
wrapper::vadd(saturating_doubling_high_mul(acc1.at(i) * (1 << (-out_shift)), out_mul), output_qoffset_vec);
734 acc0.at(i) =
wrapper::vadd(rounding_divide_by_exp2(saturating_doubling_high_mul(acc0.at(i), out_mul), out_shift), output_qoffset_vec);
735 acc1.at(i) =
wrapper::vadd(rounding_divide_by_exp2(saturating_doubling_high_mul(acc1.at(i), out_mul), out_shift), output_qoffset_vec);
// Unsigned output additionally clamps below at zero.
744 if(std::is_same<T, uint8_t>::value)
754 input_it, weights_it, biases_it, output_it);
// Static validation of the kernel configuration.
// NOTE(review): extract is incomplete -- most checks (data types, dimensions,
// biases/dst validation bodies) are missing here; verify against the full file.
757 Status validate_arguments(
const ITensorInfo *src,
const ITensorInfo *weights,
const ITensorInfo *biases,
const ITensorInfo *dst,
const ConvolutionInfo &
info)
// The dilated kernel extent must fit within the padded input, per axis.
764 ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(1) + (weights->dimension(1) - 1) * (info.dilation.x() - 1) > src->dimension(1) + info.pad_stride_info.pad_left() + info.pad_stride_info.pad_right());
765 ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) + (weights->dimension(2) - 1) * (info.dilation.y() - 1) > src->dimension(2) + info.pad_stride_info.pad_top() + info.pad_stride_info.pad_bottom());
780 if(biases !=
nullptr)
795 if(dst->total_size() != 0)
// Fragments of CpuDepthwiseConv2dNativeKernel::configure.
// NOTE(review): extract is incomplete -- the function signature, the dispatch
// switch on data type and most surrounding statements are missing here.
814 _has_biases = (biases !=
nullptr);
// Per-tensor quantization: replicate the single weights scale across channels
// so the per-channel requantization path can be used uniformly.
824 for(
size_t i = 1; i < weights->
dimension(channel_idx); ++i)
826 weights_scale.push_back(weights_scale.front());
// Precompute the fixed-point (multiplier, shift) pair for each output channel.
830 for(
const auto &s : weights_scale)
832 int32_t out_mult = 0;
833 int32_t out_shift = 0;
834 const float multiplier = input_scale * s / output_scale;
837 _output_multiplier.push_back(out_mult);
838 _output_shift.push_back(out_shift);
// Select the templated run_depthwise instantiation for the data-type pair.
845 _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise<uint8_t, uint8_t>;
848 _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise<int8_t, int8_t>;
853 _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise<uint8_t, int8_t>;
857 _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise<int8_t, int8_t>;
// NOTE(review): the following two lines have preprocessor directives fused
// with code by the extraction; in the original, FP16 is guarded by
// __ARM_FEATURE_FP16_VECTOR_ARITHMETIC.
860 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 862 _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise<float16_t, float16_t>;
864 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 866 _func = &CpuDepthwiseConv2dNativeKernel::run_depthwise<float, float>;
877 ICpuKernel::configure(win);
// Float dispatch: depth_multiplier == 1 takes the vectorized multiplier-1 path,
// otherwise the generic per-multiplier path.
// NOTE(review): extract is incomplete -- the dst/window/has_biases parameters
// and the else branch structure are missing here.
886 template <
typename T,
typename TW, CpuDepthwiseConv2dNativeKernel::FloatEnalber<T>>
887 void CpuDepthwiseConv2dNativeKernel::run_depthwise(
const ITensor *
src,
const ITensor *weights,
const ITensor *biases,
893 if(_depth_multiplier == 1)
895 depthwise_loop_multiplier1_fp<T>(
src, weights, biases,
dst, _conv_info, _dilation, window, has_biases);
899 depthwise_loop_generic_fp<T>(
src, weights, biases,
dst, _conv_info, _dilation, _depth_multiplier, window, has_biases);
// Quantized dispatch: multiplier-1 path, then the pow2 per-tensor fast path
// (multiplier a power of two and >= 8 with per-tensor quantization), otherwise
// the generic quantized path.
// NOTE(review): extract is incomplete -- the dst/window/has_biases parameters,
// is_quantized_per_tensor computation and branch structure are missing here.
903 template <
typename T,
typename TW, CpuDepthwiseConv2dNativeKernel::Quantized8bitEnalber<T>>
904 void CpuDepthwiseConv2dNativeKernel::run_depthwise(
const ITensor *src,
const ITensor *weights,
const ITensor *biases,
910 if(_depth_multiplier == 1)
912 depthwise_loop_multiplier1_quantized<T, TW>(
src, weights, biases,
dst, _conv_info, _dilation, _output_multiplier, _output_shift, window, has_biases);
// Power-of-two test via the classic (n & (n - 1)) == 0 trick.
916 const bool is_pow2 = ((_depth_multiplier & (_depth_multiplier - 1)) == 0);
919 if(is_pow2 && is_quantized_per_tensor && _depth_multiplier >= 8)
921 depthwise_loop_pow2_quantized_per_tensor<T, TW>(
src, weights, biases,
dst, _conv_info, _dilation, _depth_multiplier, _output_multiplier, _output_shift, window, has_biases);
925 depthwise_loop_generic_quantized<T, TW>(
src, weights, biases,
dst, _conv_info, _dilation, _depth_multiplier, _output_multiplier, _output_shift, window, has_biases);
// Fragment of run_op: invoke the data-type-specific loop selected in configure.
941 (this->*_func)(src, weights, biases, dst, window, _has_biases);
// Fragment of name(): the kernel's identifier.
946 return "CpuDepthwiseConv2dNativeKernel";
bool is_data_type_quantized(DataType dt)
Check if a given data type is of quantized type.
const size_t weights_stride_z
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
Traits defined on Arm® Neon™ vectors.
const Window & window() const
The maximum window the kernel can be executed on.
TensorShape compute_depthwise_convolution_shape(const ITensorInfo &input, const ITensorInfo &weights, const ConvolutionInfo &info)
Calculate the depthwise convolution output shape of a tensor.
const size_t weights_height
uint32x2_t vmovn(const uint64x2_t &a)
#define ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(tensor)
const size_t conv_pad_left
virtual size_t dimension(size_t index) const =0
Return the size of the requested dimension.
const size_t input_stride_y
const size_t weights_stride_y
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
uint8x16_t vloadq(const uint8_t *ptr)
#define ARM_COMPUTE_RETURN_ON_ERROR(status)
Checks if a status contains an error and returns it.
size_t element_size_from_data_type(DataType dt)
The size in bytes of the data type.
virtual DataType data_type() const =0
Data type used for each element of the tensor.
uint8x8_t vadd(const uint8x8_t &a, const uint8x8_t &b)
1 channel, 1 F32 per channel
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
const DataLayout data_layout
Store the tensor's metadata.
#define ARM_COMPUTE_ERROR_THROW_ON(status)
uint8x8_t vsub(const uint8x8_t &a, const uint8x8_t &b)
Status calculate_quantized_multiplier(float multiplier, int32_t *quant_multiplier, int32_t *shift, bool ignore_epsilon=false)
Calculate quantized representation of multiplier.
const size_t input_stride_z
#define ARM_COMPUTE_RETURN_ERROR_ON(cond)
If the condition is true, an error is returned.
decltype(strategy::transforms) typedef type
Interface for CPU tensor.
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(...)
SimpleTensor< float > src
Copyright (c) 2017-2021 Arm Limited.
1 channel, 1 F16 per channel
const size_t num_read_elements_per_iteration
#define ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(...)
1 channel, 1 S32 per channel
uint32x2_t vqmovn(const uint64x2_t &a)
unsigned int depth_multiplier
Multiplier to apply to input's depth to retrieve the output depth.
const ITensor * get_const_tensor(int id) const
Get constant tensor of a given id.
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
library fill(src, distribution, 0)
bool is_data_type_quantized_per_channel(DataType dt)
Check if a given data type is of per channel type.
PadStrideInfo pad_stride_info
Convolution info (Pads, strides,...)
quantized, asymmetric fixed-point 8-bit number unsigned
Class to describe a number of elements in each dimension.
const size_t input_max_offset
int16x4_t vreinterpret(const uint16x4_t &a)
uint8x8_t vmin(const uint8x8_t &a, const uint8x8_t &b)
Size2D dilation
Dilation, in elements, across x and y.
UniformQuantizationInfo uniform() const
Return per layer quantization info.
bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, QuantizationInfo quantization_info=QuantizationInfo())
Auto initialize the tensor info (shape, number of channels and data type) if the current assignment i...
virtual std::unique_ptr< T > clone() const =0
Provide a clone of the current object of class T.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
const std::vector< float > & scale() const
Scale vector accessor.
uint8x8_t vgetlow(const uint8x16_t val)
void end(TokenStream &in, bool &valid)
uint8x16_t vcombine(const uint8x8_t &a, const uint8x8_t &b)
static constexpr size_t DimW
Alias for dimension 3 also known as W dimension.
const size_t weights_width
virtual QuantizationInfo quantization_info() const =0
Get the quantization settings (scale and offset) of the tensor.
uint16x8_t vmlal(const uint16x8_t &a, const uint8x8_t &b, const uint8x8_t &c)
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
const size_t conv_stride_x
bool is_data_type_quantized_asymmetric(DataType dt)
Check if a given data type is of asymmetric quantized type.
void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ConvolutionInfo &info)
Initialize the function's source, destination and parameters.
quantized, symmetric per channel fixed-point 8-bit number
uint8x8_t vgethigh(const uint8x16_t val)
void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
static constexpr size_t DimY
Alias for dimension 1 also known as Y dimension.
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
ITensor * get_tensor(int id)
Get tensor of a given id from the pack.
Information about executing thread and CPU.
const size_t conv_pad_top
T fma(T x, T y, T z)
Computes (x*y) + z as if to infinite precision and rounded only once to fit the result type...
static constexpr size_t DimZ
Alias for dimension 2 also known as Z dimension.
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(...)
Num samples, height, width, channels.
const size_t input_height
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
uint8x8_t vload(const uint8_t *ptr)
void vstore(uint8_t *ptr, uint8x8_t val)
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
uint8x8_t vdup_n(uint8_t value, traits::vector_64_tag)
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_funct...
static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ConvolutionInfo &info)
Static function to check if given info will lead to a valid configuration.
quantized, asymmetric fixed-point 8-bit number signed
Includes all wrapper headers at once.
uint8x8_t vmla(const uint8x8_t &a, const uint8x8_t &b, const uint8x8_t &c)
size_t get_data_layout_dimension_index(const DataLayout data_layout, const DataLayoutDimension data_layout_dimension)
Get the index of the given dimension.
const size_t conv_stride_y
uint16x8_t vmovl(const uint8x8_t &a)
uint8x8_t vmax(const uint8x8_t &a, const uint8x8_t &b)
Describe a multidimensional execution window.
const char * name() const override
Name of the kernel.
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)
const uint32_t x_leftover_start