// File-scope constants used by the depthwise loops in this file.
// dim_manual_loop is a Window::Dimension built from (0, 0, 0); its uses are not visible in this excerpt.
31 #include "src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp" 51 constexpr
auto dim_manual_loop = Window::Dimension(0, 0, 0);
// Window::Dimension built from (0, 1, 1); the multiplier-1 loops install it on DimX of their execution window.
52 constexpr
auto dim_single_unit_step = Window::Dimension(0, 1, 1);
// The multiplier-1 loops divide this by sizeof(T) to obtain element_per_vector; the pow2 quantized
// loop uses it directly as the lane count of its 8-bit vectors and as the step over depth_multiplier.
53 constexpr
size_t vector_size = 8;
// Per-run parameter bundle. The loop functions in this file read its fields, e.g. x_start,
// x_leftover_start, x_end, x_step, input_stride_y, input_stride_z, input_max_offset,
// weights_width, weights_height, conv_stride_x and conv_pad_left.
// NOTE(review): the member declarations and the constructor body are not visible in this excerpt.
55 struct DepthwiseConvolutionRunInfo
// Built from the input and weights tensor infos, the pad/stride info and the execution window;
// depth_multiplier defaults to 1 when the caller omits it.
77 DepthwiseConvolutionRunInfo(
const ITensorInfo &
input,
const ITensorInfo &weights,
const PadStrideInfo &
conv_info,
const Window &
w, uint32_t depth_multiplier = 1)
// Returns true when kernel tap (w, h), scaled by the dilation and added to the base
// coordinate (base_w, base_h), lies inside [0, input_width) x [0, input_height).
// NOTE(review): base_h is declared uint32_t while base_w is int32_t, and the callers in this
// file pass a signed value (input_z, computed as stride * id - pad) for it. The sum is only
// narrowed back to int32_t when it is assigned to current_h; consider declaring base_h as int32_t.
101 inline bool is_valid_input_region(int32_t base_w, uint32_t base_h, uint32_t
w, uint32_t h,
const DepthwiseConvolutionRunInfo &run_info,
const Size2D &dilation)
// Row coordinate of the tap and its bounds check against the input height.
103 const int32_t current_h = base_h + h * dilation.y();
104 const bool is_valid_h = current_h >= 0 && current_h < static_cast<int32_t>(run_info.input_height);
// Column coordinate of the tap and its bounds check against the input width.
106 const int32_t current_w = base_w +
w * dilation.x();
107 const bool is_valid_w = current_w >= 0 && current_w < static_cast<int32_t>(run_info.input_width);
109 return is_valid_h && is_valid_w;
// Floating-point depthwise loop; run_depthwise selects it when _depth_multiplier == 1.
// For every window position it walks the weights_height x weights_width taps, first over
// x in steps of run_info.x_step up to x_leftover_start (vector loads), then one element at a
// time up to x_end, and stores the scalar results into output.
// NOTE(review): this excerpt is missing several lines of the function (braces, the vector
// accumulate/store statements, the has_biases guards); the comments below only describe
// what is visible.
112 template <
typename T>
113 void depthwise_loop_multiplier1_fp(
const ITensor *
input,
const ITensor *weights,
const ITensor *biases, ITensor *output,
const PadStrideInfo &
conv_info,
114 const Size2D &dilation,
const Window &window,
bool has_biases)
// Lane count of one vector: vector_size divided by the element size.
116 constexpr
auto element_per_vector = vector_size /
sizeof(T);
118 using TagType =
typename wrapper::traits::neon_vector<T, element_per_vector>::tag_type;
120 const auto run_info = DepthwiseConvolutionRunInfo(*
input->info(), *weights->info(),
conv_info, window);
// All-zero vector; the vector accumulator below is initialised from it.
122 const VectorType zero_vector =
wrapper::vdup_n(static_cast<T>(0), TagType{});
// DimX of the execution window is replaced by dim_single_unit_step; x is advanced manually below.
124 Window execution_window = window;
125 execution_window.set(
Window::DimX, dim_single_unit_step);
127 Window win_input = window;
132 Window win_weights = win_input;
135 Window win_output = window;
138 Iterator input_it(
input, win_input);
139 Iterator weights_it(weights, win_weights);
140 Iterator output_it(output, win_output);
141 Iterator biases_it{};
// NOTE(review): the condition guarding this assignment is not visible here (presumably has_biases).
145 biases_it = Iterator(biases, win_weights);
// Input coordinate of the first kernel tap: window id times the stride, minus the padding.
150 const int32_t input_y =
id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
151 const int32_t input_z =
id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
// Byte offset of that coordinate relative to input_it.ptr(); may be negative when padding applies.
152 const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
154 auto const base_weights_ptr = weights_it.ptr();
155 uint32_t x = run_info.x_start;
// Vector part: x advances by x_step until x_leftover_start.
157 for(; x < run_info.x_leftover_start; x += run_info.x_step)
159 VectorType acc = zero_vector;
160 auto weights_ptr = base_weights_ptr;
161 int64_t input_offset = base_input_offset;
163 for(uint32_t h = 0; h < run_info.weights_height; ++h)
165 int64_t offs = input_offset + x *
sizeof(T);
166 for(uint32_t
w = 0;
w < run_info.weights_width; ++
w)
168 const bool is_valid_region = is_valid_input_region(input_y, input_z,
w, h, run_info, dilation);
// Valid taps are loaded from the input with the offset capped at input_max_offset.
// NOTE(review): the alternative operand of this conditional is not visible in this excerpt.
169 const auto input_vals = is_valid_region ?
170 wrapper::vload(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) :
172 const auto weights_vals =
wrapper::vload(reinterpret_cast<T *>(weights_ptr +
w * run_info.weights_stride_y) + x);
// Next tap in the row: move by the horizontal dilation.
175 offs += dilation.x() * run_info.input_stride_y;
// Next kernel row: advance the weights by one z stride and the input by the vertical dilation.
178 weights_ptr += run_info.weights_stride_z;
179 input_offset += dilation.y() * run_info.input_stride_z;
184 const auto biases_vals =
wrapper::vload(reinterpret_cast<T *>(biases_it.ptr()) + x);
// Scalar tail: remaining x positions up to x_end, one element per iteration.
191 for(; x < run_info.x_end; ++x)
193 auto acc_scalar = T{ 0 };
194 auto weights_ptr = base_weights_ptr;
195 int64_t input_offset = base_input_offset;
197 for(
size_t h = 0; h < run_info.weights_height; ++h)
199 int64_t offs = input_offset + x *
sizeof(T);
200 for(
size_t w = 0;
w < run_info.weights_width; ++
w)
202 const bool is_valid_region = is_valid_input_region(input_y, input_z,
w, h, run_info, dilation);
// Taps outside the input contribute 0.
203 const auto input_vals = is_valid_region ? *reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset)) : 0;
204 const auto weights_vals = *(reinterpret_cast<T *>(weights_ptr +
w * run_info.weights_stride_y) + x);
206 acc_scalar += (input_vals * weights_vals);
208 offs += dilation.x() * run_info.input_stride_y;
211 weights_ptr += run_info.weights_stride_z;
212 input_offset += dilation.y() * run_info.input_stride_z;
// NOTE(review): the guard around the bias addition is not visible here (presumably has_biases).
217 const auto biases_vals = *(reinterpret_cast<T *>(biases_it.ptr()) + x);
218 acc_scalar += biases_vals;
220 *(reinterpret_cast<T *>(output_it.ptr()) + x) = acc_scalar;
// Trailing iterator arguments of a call whose opening is not visible (presumably execute_window_loop).
223 input_it, weights_it, biases_it, output_it);
// Floating-point depthwise loop for an arbitrary depth_multiplier; run_depthwise calls it when
// _depth_multiplier != 1. For each window position it reads one input value per kernel tap and
// pairs it with depth_multiplier consecutive weights, then writes depth_multiplier outputs.
// NOTE(review): braces, the accumulate statement inside the m loop and the has_biases branch
// are not visible in this excerpt.
// NOTE(review): input_offset and offs are plain int here, whereas depthwise_loop_multiplier1_fp
// uses int64_t for the same quantities.
226 template <
typename T>
227 void depthwise_loop_generic_fp(
const ITensor *
input,
const ITensor *weights,
const ITensor *biases, ITensor *output,
const PadStrideInfo &
conv_info,
228 const Size2D &dilation,
unsigned int depth_multiplier,
const Window &window,
bool has_biases)
230 const auto run_info = DepthwiseConvolutionRunInfo(*
input->info(), *weights->info(),
conv_info, window, depth_multiplier);
// DimX of the execution and input windows is set from (0, input_depth, 1)
// (presumably start, end, step - TODO confirm against Window::Dimension).
232 Window execution_window = window;
233 execution_window.set(
Window::DimX, Window::Dimension(0, run_info.input_depth, 1));
235 Window win_input = execution_window;
236 win_input.set(
Window::DimX, Window::Dimension(0, run_info.input_depth, 1));
// Weights and output windows keep the caller's range but step DimX by run_info.x_step.
240 Window win_weights = window;
241 win_weights.set_dimension_step(
Window::DimX, run_info.x_step);
246 Window win_output = window;
247 win_output.set_dimension_step(
Window::DimX, run_info.x_step);
249 Iterator input_it(
input, win_input);
250 Iterator weights_it(weights, win_weights);
251 Iterator output_it(output, win_output);
252 Iterator biases_it{};
// NOTE(review): the condition guarding this assignment is not visible here (presumably has_biases).
256 biases_it = Iterator(biases, win_weights);
// One zero-initialised accumulator per m in [0, depth_multiplier).
261 std::vector<T> acc(depth_multiplier, static_cast<T>(0));
// Input coordinate of the first kernel tap: window id times the stride, minus the padding.
263 const int input_y =
id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
264 const int input_z =
id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
265 int input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
267 auto weights_ptr = weights_it.ptr();
268 for(
size_t h = 0; h < run_info.weights_height; ++h)
270 int offs = input_offset;
271 for(
size_t w = 0;
w < run_info.weights_width; ++
w)
273 const bool is_valid_region = is_valid_input_region(input_y, input_z,
w, h, run_info, dilation);
// Taps outside the input read as T(0); valid taps are read with the offset capped at input_max_offset.
274 const auto input_val = is_valid_region ? *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) : T(0);
276 for(
size_t m = 0; m < depth_multiplier; ++m)
// The m-th weight of this tap sits m elements after weights_ptr in the current weights row.
278 const auto weights_val = *(reinterpret_cast<T *>(weights_ptr + m *
sizeof(T) +
w * run_info.weights_stride_y));
282 offs += dilation.x() * run_info.input_stride_y;
285 weights_ptr += run_info.weights_stride_z;
286 input_offset += dilation.y() * run_info.input_stride_z;
// Output with bias: each accumulator plus the m-th bias value.
291 for(
size_t m = 0; m < depth_multiplier; ++m)
293 const auto biases_val = *(reinterpret_cast<T *>(biases_it.ptr() + m *
sizeof(T)));
294 *(reinterpret_cast<T *>(output_it.ptr() + m *
sizeof(T))) = acc.at(m) + biases_val;
// Output without bias: the accumulators are stored as they are.
299 for(
size_t m = 0; m < depth_multiplier; ++m)
301 *(reinterpret_cast<T *>(output_it.ptr() + m *
sizeof(T))) = acc.at(m);
// Trailing iterator arguments of a call whose opening is not visible (presumably execute_window_loop).
305 input_it, weights_it, biases_it, output_it);
// Quantized depthwise loop; run_depthwise selects it when _depth_multiplier == 1.
// T is the element type read from input and written to output, TW the element type read from
// weights; accumulation is done in int32_t (AccType).
// NOTE(review): output_multiplier and output_shift are taken by value, so every call copies
// both vectors; a const reference would avoid that without affecting callers.
// NOTE(review): braces, several declarations (acc, in_sum/we_sum of the scalar tail, out_vals)
// and the branch conditions are not visible in this excerpt.
308 template <
typename T,
typename TW>
309 void depthwise_loop_multiplier1_quantized(
const ITensor *
input,
const ITensor *weights,
const ITensor *biases, ITensor *output,
const PadStrideInfo &
conv_info,
310 const Size2D &dilation, std::vector<int> output_multiplier, std::vector<int> output_shift,
const Window &window,
bool has_biases)
// Lane count of one vector: vector_size divided by the element size.
312 constexpr
auto element_per_vector = vector_size /
sizeof(T);
314 using TagType =
typename wrapper::traits::neon_vector<T, element_per_vector>::tag_type;
315 using AccType = int32_t;
316 using AccArrayType = std::array<AccType, element_per_vector>;
// Value obtained from a PixelValue built from 0 with the input's data type and quantization info,
// and the same value broadcast to a vector (presumably used for out-of-bounds taps; those lines are not visible).
318 const auto out_of_bound_value = PixelValue(static_cast<uint64_t>(0),
input->info()->data_type(),
input->info()->quantization_info()).get<T>();
319 const auto out_of_bound_vector =
wrapper::vdup_n(static_cast<T>(out_of_bound_value), TagType{});
321 const auto run_info = DepthwiseConvolutionRunInfo(*
input->info(), *weights->info(),
conv_info, window);
323 const int32_t input_qoffset =
input->info()->quantization_info().uniform().offset;
324 const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset;
325 const int32_t output_qoffset = output->info()->quantization_info().uniform().offset;
// Constant term of the offset correction: number of kernel taps times both zero points.
326 const int32_t k_offset = run_info.weights_width * run_info.weights_height * input_qoffset * weights_qoffset;
// DimX of the execution window is replaced by dim_single_unit_step; x is advanced manually below.
328 Window execution_window = window;
329 execution_window.set(
Window::DimX, dim_single_unit_step);
331 Window win_input = window;
336 Window win_weights = win_input;
339 Window win_output = window;
342 Iterator input_it(
input, win_input);
343 Iterator weights_it(weights, win_weights);
344 Iterator output_it(output, win_output);
345 Iterator biases_it{};
// NOTE(review): the condition guarding this assignment is not visible here (presumably has_biases).
349 biases_it = Iterator(biases, win_weights);
// Input coordinate of the first kernel tap: window id times the stride, minus the padding.
354 const int32_t input_y =
id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
355 const int32_t input_z =
id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
356 const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
357 auto const base_weights_ptr = weights_it.ptr();
358 size_t x = run_info.x_start;
// Vector part: x advances by x_step until x_leftover_start.
360 for(; x < run_info.x_leftover_start; x += run_info.x_step)
// Per-lane running sums of the raw input and weight values, used by the offset correction below.
363 AccArrayType in_sum{};
364 AccArrayType we_sum{};
366 auto weights_ptr = base_weights_ptr;
367 auto input_offset = base_input_offset;
369 for(
size_t h = 0; h < run_info.weights_height; ++h)
371 int64_t offs = input_offset + x *
sizeof(T);
372 for(
size_t w = 0;
w < run_info.weights_width; ++
w)
374 const bool is_valid_region = is_valid_input_region(input_y, input_z,
w, h, run_info, dilation);
// Valid taps are loaded from the input with the offset capped at input_max_offset.
// NOTE(review): the alternative operand of this conditional is not visible in this excerpt.
375 const auto input_vals = is_valid_region ?
376 wrapper::vload(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) :
378 const auto weights_vals =
wrapper::vload(reinterpret_cast<TW *>(weights_ptr +
w * run_info.weights_stride_y) + x);
// Lane-by-lane accumulation of the raw product and of both operand sums.
380 for(
size_t i = 0; i < element_per_vector; ++i)
382 acc.at(i) += input_vals[i] * weights_vals[i];
383 in_sum.at(i) += input_vals[i];
384 we_sum.at(i) += weights_vals[i];
387 offs += dilation.x() * run_info.input_stride_y;
390 weights_ptr += run_info.weights_stride_z;
391 input_offset += dilation.y() * run_info.input_stride_z;
395 for(
size_t i = 0; i < element_per_vector; ++i)
// Offset correction: together these three updates turn the sum of raw products into the sum of
// (input - input_qoffset) * (weights - weights_qoffset) over all kernel taps.
397 acc.at(i) -= in_sum.at(i) * weights_qoffset;
398 acc.at(i) -= we_sum.at(i) * input_qoffset;
399 acc.at(i) += k_offset;
// Bias is read as int32_t for channel x + i.
// NOTE(review): the guard around this addition is not visible here (presumably has_biases).
403 acc.at(i) += *(reinterpret_cast<int32_t *>(biases_it.ptr() + i *
sizeof(int32_t)) + x);
// Requantization parameters of channel x + i.
406 const int32_t out_mul = output_multiplier.at(x + i);
407 const int32_t out_shift = output_shift.at(x + i);
// Two alternative requantization forms follow; the condition choosing between them is not visible
// (the first one shifts left by -out_shift, so it presumably handles out_shift < 0).
410 acc.at(i) = saturating_doubling_high_mul(acc.at(i) * (1 << (-out_shift)), out_mul) + output_qoffset;
414 acc.at(i) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(i), out_mul), out_shift) + output_qoffset;
// Clamp the 32-bit result into T before it is placed in the output lane.
416 out_vals[i] = static_cast<T>(utility::clamp<AccType, T>(acc.at(i)));
// Scalar tail: remaining x positions up to x_end, one element per iteration.
423 for(; x < run_info.x_end; ++x)
429 auto weights_ptr = base_weights_ptr;
430 auto input_offset = base_input_offset;
432 for(
size_t h = 0; h < run_info.weights_height; ++h)
434 int64_t offs = input_offset + x *
sizeof(T);
435 for(
size_t w = 0;
w < run_info.weights_width; ++
w)
437 const bool is_valid_region = is_valid_input_region(input_y, input_z,
w, h, run_info, dilation);
// NOTE(review): the alternative operand of this conditional is not visible in this excerpt.
438 const auto input_val = is_valid_region ?
439 *reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset)) :
441 const auto weights_val = *(reinterpret_cast<TW *>(weights_ptr +
w * run_info.weights_stride_y) + x);
443 acc += input_val * weights_val;
445 we_sum += weights_val;
447 offs += dilation.x() * run_info.input_stride_y;
450 weights_ptr += run_info.weights_stride_z;
451 input_offset += dilation.y() * run_info.input_stride_z;
// Scalar counterpart of the offset correction applied in the vector part.
456 acc -= in_sum * weights_qoffset;
457 acc -= we_sum * input_qoffset;
// NOTE(review): the guard around this addition is not visible here (presumably has_biases).
462 acc += *(reinterpret_cast<int32_t *>(biases_it.ptr()) + x);
465 const int32_t out_mul = output_multiplier.at(x);
466 const int32_t out_shift = output_shift.at(x);
// Same two alternative requantization forms as in the vector part; the selecting condition is not visible.
470 acc = saturating_doubling_high_mul(acc * (1 << (-out_shift)), out_mul) + output_qoffset;
474 acc = rounding_divide_by_exp2(saturating_doubling_high_mul(acc, out_mul), out_shift) + output_qoffset;
// Clamp into T and store at channel x of the current output position.
477 out_vals = static_cast<T>(utility::clamp<AccType, T>(acc));
478 *(reinterpret_cast<T *>(output_it.ptr()) + x) = out_vals;
// Trailing iterator arguments of a call whose opening is not visible (presumably execute_window_loop).
481 input_it, weights_it, biases_it, output_it);
// Quantized depthwise loop for an arbitrary depth_multiplier; run_depthwise falls back to it when
// neither the multiplier-1 nor the pow2 per-tensor loop is chosen. T is the input/output element
// type, TW the weights element type; accumulation is done in int32_t.
// NOTE(review): output_multiplier and output_shift are taken by value, so every call copies both vectors.
// NOTE(review): braces, the declaration/update of in_sum and the branch conditions are not
// visible in this excerpt.
484 template <
typename T,
typename TW>
485 void depthwise_loop_generic_quantized(
const ITensor *
input,
const ITensor *weights,
const ITensor *biases, ITensor *output,
const PadStrideInfo &
conv_info,
486 const Size2D &dilation,
unsigned int depth_multiplier, std::vector<int> output_multiplier, std::vector<int> output_shift,
const Window &window,
bool has_biases)
488 using AccType = int32_t;
490 const auto run_info = DepthwiseConvolutionRunInfo(*
input->info(), *weights->info(),
conv_info, window, depth_multiplier);
// Value substituted for taps that fall outside the input: obtained from a PixelValue built
// from 0 with the input's data type and quantization info.
492 const auto out_of_bound_value = PixelValue(static_cast<uint64_t>(0),
input->info()->data_type(),
input->info()->quantization_info()).get<T>();
494 const int32_t input_qoffset =
input->info()->quantization_info().uniform().offset;
495 const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset;
496 const int32_t output_qoffset = output->info()->quantization_info().uniform().offset;
// Constant term of the offset correction: number of kernel taps times both zero points.
497 const int32_t k_offset = run_info.weights_width * run_info.weights_height * input_qoffset * weights_qoffset;
// DimX of the execution window is set from (0, input_depth, 1)
// (presumably start, end, step - TODO confirm against Window::Dimension).
499 Window execution_window = window;
500 execution_window.set(
Window::DimX, Window::Dimension(0, run_info.input_depth, 1));
502 Window win_input = execution_window;
// Weights and output windows keep the caller's range but step DimX by run_info.x_step.
506 Window win_weights = window;
507 win_weights.set_dimension_step(
Window::DimX, run_info.x_step);
512 Window win_output = window;
513 win_output.set_dimension_step(
Window::DimX, run_info.x_step);
515 Iterator input_it(
input, win_input);
516 Iterator weights_it(weights, win_weights);
517 Iterator output_it(output, win_output);
518 Iterator biases_it{};
// NOTE(review): the condition guarding this assignment is not visible here (presumably has_biases).
522 biases_it = Iterator(biases, win_weights);
// One accumulator and one weight sum per m in [0, depth_multiplier).
527 std::vector<AccType> acc(depth_multiplier, 0);
528 std::vector<AccType> we_sum(depth_multiplier, 0);
// Input coordinate of the first kernel tap: window id times the stride, minus the padding.
531 const int32_t input_y =
id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
532 const int32_t input_z =
id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
533 int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
535 auto weights_ptr = weights_it.ptr();
536 for(
size_t h = 0; h < run_info.weights_height; ++h)
// NOTE(review): offs is a plain int although input_offset is int64_t, so this assignment narrows.
538 int offs = input_offset;
539 for(
size_t w = 0;
w < run_info.weights_width; ++
w)
541 const bool is_valid_region = is_valid_input_region(input_y, input_z,
w, h, run_info, dilation);
542 const auto input_val = is_valid_region ? *(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) : out_of_bound_value;
544 for(
size_t m = 0; m < depth_multiplier; ++m)
// NOTE(review): the byte offset uses sizeof(T) although the value is read through a TW pointer;
// the two agree only when sizeof(T) == sizeof(TW), which holds for the 8-bit instantiations
// selected in configure.
546 const auto weights_val = *(reinterpret_cast<TW *>(weights_ptr + m *
sizeof(T) +
w * run_info.weights_stride_y));
547 acc.at(m) += input_val * weights_val;
549 we_sum.at(m) += weights_val;
552 offs += dilation.x() * run_info.input_stride_y;
556 weights_ptr += run_info.weights_stride_z;
557 input_offset += dilation.y() * run_info.input_stride_z;
560 for(
size_t m = 0; m < depth_multiplier; ++m)
// Offset correction using the input sum, the per-m weight sum and the constant k_offset.
562 acc.at(m) -= in_sum * weights_qoffset;
563 acc.at(m) -= we_sum.at(m) * input_qoffset;
564 acc.at(m) += k_offset;
// NOTE(review): the guard around this addition is not visible here (presumably has_biases).
568 acc.at(m) += *(reinterpret_cast<int32_t *>(biases_it.ptr() + m *
sizeof(int32_t)));
// Requantization parameters are indexed by id.x() * depth_multiplier + m.
571 const int32_t out_mul = output_multiplier.at(
id.x() * depth_multiplier + m);
572 const int32_t out_shift = output_shift.at(
id.x() * depth_multiplier + m);
// Two alternative requantization forms follow; the condition choosing between them is not visible
// (the first one shifts left by -out_shift, so it presumably handles out_shift < 0).
575 acc.at(m) = saturating_doubling_high_mul(acc.at(m) * (1 << (-out_shift)), out_mul) + output_qoffset;
579 acc.at(m) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(m), out_mul), out_shift) + output_qoffset;
// Clamp into T and store the m-th output of the current position.
581 *(reinterpret_cast<T *>(output_it.ptr() + m *
sizeof(T))) = static_cast<T>(utility::clamp<AccType, T>(acc.at(m)));
// Trailing iterator arguments of a call whose opening is not visible (presumably execute_window_loop).
584 input_it, weights_it, biases_it, output_it);
// Vectorised quantized depthwise loop; run_depthwise selects it when _depth_multiplier is a power
// of two, at least 8, and is_quantized_per_tensor holds. It processes depth_multiplier in groups of
// vector_size and uses a single multiplier/shift pair (index 0) for requantization.
// NOTE(review): output_multiplier and output_shift are taken by value, so every call copies both vectors.
// NOTE(review): braces, the widening/accumulate statements, several vector constants
// (input_qoffset_vec, weights_qoffset_vec, output_qoffset_vec, lower) and the final narrowing/store
// are not visible in this excerpt.
587 template <
typename T,
typename TW>
588 void depthwise_loop_pow2_quantized_per_tensor(
const ITensor *
input,
const ITensor *weights,
const ITensor *biases, ITensor *output,
const PadStrideInfo &
conv_info,
589 const Size2D &dilation,
unsigned int depth_multiplier, std::vector<int> output_multiplier, std::vector<int> output_shift,
const Window &window,
bool has_biases)
// Half of vector_size: lane count of the int32_t accumulator vectors.
591 constexpr
int half_vec = vector_size / 2;
593 using AccType = int32_t;
595 using AccVectorTagType =
typename wrapper::traits::neon_vector<AccType, half_vec>::tag_type;
596 using TagType =
typename wrapper::traits::neon_vector<T, vector_size>::tag_type;
598 const auto run_info = DepthwiseConvolutionRunInfo(*
input->info(), *weights->info(),
conv_info, window, depth_multiplier);
// Accumulator-typed vectors holding the maximum of T and zero (presumably clamp bounds - the use is not visible).
605 const auto upper =
wrapper::vdup_n(static_cast<AccType>(std::numeric_limits<T>::max()), AccVectorTagType{});
606 const auto zero =
wrapper::vdup_n(static_cast<AccType>(0), AccVectorTagType{});
// Only the first multiplier/shift entry is used by this loop.
608 const auto out_mul = output_multiplier.at(0);
609 const auto out_shift = output_shift.at(0);
// DimX of the execution window is set from (0, input_depth, 1)
// (presumably start, end, step - TODO confirm against Window::Dimension).
611 Window execution_window = window;
612 execution_window.set(
Window::DimX, Window::Dimension(0, run_info.input_depth, 1));
614 Window win_input = execution_window;
// Weights and output windows keep the caller's range but step DimX by run_info.x_step.
618 Window win_weights = window;
619 win_weights.set_dimension_step(
Window::DimX, run_info.x_step);
624 Window win_output = window;
625 win_output.set_dimension_step(
Window::DimX, run_info.x_step);
627 Iterator input_it(
input, win_input);
628 Iterator weights_it(weights, win_weights);
629 Iterator output_it(output, win_output);
630 Iterator biases_it{};
// NOTE(review): the condition guarding this assignment is not visible here (presumably has_biases).
634 biases_it = Iterator(biases, win_weights);
// Two accumulator vectors per group of vector_size outputs
// (presumably the low and high halves of each group - TODO confirm).
637 std::vector<AccVectorType> acc0(depth_multiplier / vector_size);
638 std::vector<AccVectorType> acc1(depth_multiplier / vector_size);
// Input coordinate of the first kernel tap: window id times the stride, minus the padding.
645 const int32_t input_y =
id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
646 const int32_t input_z =
id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
647 int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
649 auto weights_ptr = weights_it.ptr();
650 for(
size_t h = 0; h < run_info.weights_height; ++h)
// Row bounds are checked inline here instead of through is_valid_input_region.
652 const int32_t current_h = input_z + h * dilation.y();
653 if(current_h >= 0 && current_h < static_cast<int32_t>(run_info.input_height))
// NOTE(review): offs is a plain int although input_offset is int64_t, so this assignment narrows.
655 int offs = input_offset;
656 for(
size_t w = 0;
w < run_info.weights_width; ++
w)
// Column bounds check for the current tap.
658 const int32_t current_w = input_y +
w * dilation.x();
659 if(current_w >= 0 && current_w < static_cast<int32_t>(run_info.input_width))
// Broadcast the single input value of this tap to every lane.
661 const auto input_8x8 =
wrapper::vdup_n(*(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))), TagType{});
// Subtract the input zero point (input_s16x8 is produced on a line not visible here).
663 const auto input_no_offs =
wrapper::vsub(input_s16x8, input_qoffset_vec);
665 for(
size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i)
// NOTE(review): the byte offset uses sizeof(T) although the load goes through a TW pointer;
// the two agree only when sizeof(T) == sizeof(TW).
667 const auto weights_8x8 =
wrapper::vload(reinterpret_cast<TW *>(weights_ptr + m *
sizeof(T) +
w * run_info.weights_stride_y));
// Subtract the weights zero point (weights_s16x8 is produced on a line not visible here).
669 const auto weights_no_offs =
wrapper::vsub(weights_s16x8, weights_qoffset_vec);
676 offs += dilation.x() * run_info.input_stride_y;
680 weights_ptr += run_info.weights_stride_z;
681 input_offset += dilation.y() * run_info.input_stride_z;
684 for(
size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i)
// Bias vectors for outputs starting at m and at m + half_vec, read as int32_t.
// NOTE(review): the guard around the bias handling is not visible here (presumably has_biases).
688 const auto bias_val0 =
wrapper::vloadq(reinterpret_cast<int32_t *>(biases_it.ptr() + m *
sizeof(int32_t)));
689 const auto bias_val1 =
wrapper::vloadq(reinterpret_cast<int32_t *>(biases_it.ptr() + (m + half_vec) *
sizeof(int32_t)));
// Two alternative requantization forms, each applied to both accumulator halves; the condition
// choosing between them is not visible (the first one shifts left by -out_shift, so it
// presumably handles out_shift < 0).
697 acc0.at(i) =
wrapper::vadd(saturating_doubling_high_mul(acc0.at(i) * (1 << (-out_shift)), out_mul), output_qoffset_vec);
698 acc1.at(i) =
wrapper::vadd(saturating_doubling_high_mul(acc1.at(i) * (1 << (-out_shift)), out_mul), output_qoffset_vec);
702 acc0.at(i) =
wrapper::vadd(rounding_divide_by_exp2(saturating_doubling_high_mul(acc0.at(i), out_mul), out_shift), output_qoffset_vec);
703 acc1.at(i) =
wrapper::vadd(rounding_divide_by_exp2(saturating_doubling_high_mul(acc1.at(i), out_mul), out_shift), output_qoffset_vec);
// Type-dependent branch for uint8_t; both branch bodies are not visible in this excerpt.
712 if(std::is_same<T, uint8_t>::value)
// Trailing iterator arguments of a call whose opening is not visible (presumably execute_window_loop).
722 input_it, weights_it, biases_it, output_it);
// Validates the tensor infos and convolution info of the kernel and returns a Status.
// NOTE(review): the individual checks are not visible in this excerpt; only the two
// conditions below survive.
725 Status
validate_arguments(
const ITensorInfo *
input,
const ITensorInfo *weights,
const ITensorInfo *biases,
const ITensorInfo *output,
const ConvolutionInfo &
info)
// Checks that apply only when a biases tensor info is supplied.
748 if(biases !=
nullptr)
// Checks that apply only when output reports a non-zero total_size().
763 if(output->total_size() != 0)
// Member initialiser list of the kernel constructor: every member is value-initialised except
// _depth_multiplier, which starts at 1. (The constructor header line is not visible in this excerpt.)
775 : _func(), _conv_info(), _depth_multiplier(1), _dilation(), _output_multiplier(), _output_shift(), _has_biases()
// Body of the kernel's configure step (its header line is not visible in this excerpt): copies the
// convolution parameters into the members, precomputes the requantization multipliers/shifts and
// selects the run_depthwise instantiation stored in _func.
784 _conv_info =
info.pad_stride_info;
785 _depth_multiplier =
info.depth_multiplier;
786 _dilation =
info.dilation;
// Biases are optional; remember whether a biases info was supplied.
787 _has_biases = (biases !=
nullptr);
791 const auto input_scale =
input->quantization_info().uniform().scale;
// Replicate the first weights scale until there is one entry per element of
// weights->dimension(channel_idx). NOTE(review): the condition guarding this loop is not visible here.
797 for(
size_t i = 1; i < weights->
dimension(channel_idx); ++i)
799 weights_scale.push_back(weights_scale.front());
// One multiplier/shift pair per weights scale, derived from input_scale * s / output_scale.
// NOTE(review): the call that fills out_mult/out_shift from multiplier is not visible here
// (presumably calculate_quantized_multiplier).
803 for(
const auto &s : weights_scale)
805 int32_t out_mult = 0;
806 int32_t out_shift = 0;
807 const float multiplier = input_scale * s / output_scale;
810 _output_multiplier.push_back(out_mult);
811 _output_shift.push_back(out_shift);
// Selection of the run_depthwise<T, TW> instantiation; the case labels/conditions that pick each
// assignment are not visible in this excerpt.
818 _func = &CpuDepthwiseConvolutionNativeKernel::run_depthwise<uint8_t, uint8_t>;
821 _func = &CpuDepthwiseConvolutionNativeKernel::run_depthwise<int8_t, int8_t>;
826 _func = &CpuDepthwiseConvolutionNativeKernel::run_depthwise<uint8_t, int8_t>;
830 _func = &CpuDepthwiseConvolutionNativeKernel::run_depthwise<int8_t, int8_t>;
// The float16_t instantiation is only compiled when __ARM_FEATURE_FP16_VECTOR_ARITHMETIC is defined.
833 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 835 _func = &CpuDepthwiseConvolutionNativeKernel::run_depthwise<float16_t, float16_t>;
837 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 839 _func = &CpuDepthwiseConvolutionNativeKernel::run_depthwise<float, float>;
// Register the execution window (win is computed on a line not visible here).
850 ICpuKernel::configure(win);
// Floating-point overload of run_depthwise (enabled through FloatEnalber<T>): forwards to the
// multiplier-1 loop when _depth_multiplier == 1 and to the generic loop otherwise.
// NOTE(review): the rest of the parameter list (dst, window, has_biases are used below) and the
// else keyword/braces are not visible in this excerpt.
859 template <
typename T,
typename TW, CpuDepthwiseConvolutionNativeKernel::FloatEnalber<T>>
860 void CpuDepthwiseConvolutionNativeKernel::run_depthwise(
const ITensor *
src,
const ITensor *weights,
const ITensor *biases,
866 if(_depth_multiplier == 1)
868 depthwise_loop_multiplier1_fp<T>(
src, weights, biases,
dst, _conv_info, _dilation,
window, has_biases);
// Any other depth multiplier (presumably the else branch).
872 depthwise_loop_generic_fp<T>(
src, weights, biases,
dst, _conv_info, _dilation, _depth_multiplier,
window, has_biases);
// Quantized 8-bit overload of run_depthwise (enabled through Quantized8bitEnalber<T>). Dispatch:
//  - _depth_multiplier == 1                                     -> depthwise_loop_multiplier1_quantized
//  - power-of-two multiplier >= 8 with is_quantized_per_tensor  -> depthwise_loop_pow2_quantized_per_tensor
//  - the remaining call                                         -> depthwise_loop_generic_quantized
// NOTE(review): the else keywords/braces and the definition of is_quantized_per_tensor are not
// visible in this excerpt.
876 template <
typename T,
typename TW, CpuDepthwiseConvolutionNativeKernel::Quantized8bitEnalber<T>>
877 void CpuDepthwiseConvolutionNativeKernel::run_depthwise(
const ITensor *
src,
const ITensor *weights,
const ITensor *biases,
878 ITensor *
dst,
const Window &window,
bool has_biases)
883 if(_depth_multiplier == 1)
885 depthwise_loop_multiplier1_quantized<T, TW>(
src, weights, biases,
dst, _conv_info, _dilation, _output_multiplier, _output_shift,
window, has_biases);
// x & (x - 1) == 0 holds for powers of two (and for 0); the >= 8 test below excludes the small values.
889 const bool is_pow2 = ((_depth_multiplier & (_depth_multiplier - 1)) == 0);
892 if(is_pow2 && is_quantized_per_tensor && _depth_multiplier >= 8)
894 depthwise_loop_pow2_quantized_per_tensor<T, TW>(
src, weights, biases,
dst, _conv_info, _dilation, _depth_multiplier, _output_multiplier, _output_shift,
window, has_biases);
// Fallback for every other configuration (presumably the else branch).
898 depthwise_loop_generic_quantized<T, TW>(
src, weights, biases,
dst, _conv_info, _dilation, _depth_multiplier, _output_multiplier, _output_shift,
window, has_biases);
// Invoke the member-function pointer stored in _func with the tensors, the window and the
// stored _has_biases flag. NOTE(review): the enclosing function header is not visible in this
// excerpt (presumably run_op).
914 (this->*_func)(
src, weights, biases,
dst,
window, _has_biases);
const size_t weights_stride_z
bool is_data_type_quantized(DataType dt)
Check if a given data type is of quantized type.
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
Traits defined on Arm® Neon™ vectors.
const Window & window() const
The maximum window the kernel can be executed on.
TensorShape compute_depthwise_convolution_shape(const ITensorInfo &input, const ITensorInfo &weights, const ConvolutionInfo &info)
Calculate the depthwise convolution output shape of a tensor.
uint32x2_t vmovn(const uint64x2_t &a)
#define ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(tensor)
virtual size_t dimension(size_t index) const =0
Return the size of the requested dimension.
const size_t conv_pad_left
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
const size_t conv_stride_x
uint8x16_t vloadq(const uint8_t *ptr)
#define ARM_COMPUTE_RETURN_ON_ERROR(status)
Checks if a status contains an error and returns it.
size_t element_size_from_data_type(DataType dt)
The size in bytes of the data type.
virtual DataType data_type() const =0
Data type used for each element of the tensor.
uint8x8_t vadd(const uint8x8_t &a, const uint8x8_t &b)
1 channel, 1 F32 per channel
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
const DataLayout data_layout
Store the tensor's metadata.
#define ARM_COMPUTE_ERROR_THROW_ON(status)
uint8x8_t vsub(const uint8x8_t &a, const uint8x8_t &b)
Status calculate_quantized_multiplier(float multiplier, int32_t *quant_multiplier, int32_t *shift, bool ignore_epsilon=false)
Calculate quantized representation of multiplier.
const size_t conv_stride_y
#define ARM_COMPUTE_RETURN_ERROR_ON(cond)
If the condition is true, an error is returned.
decltype(strategy::transforms) typedef type
Interface for CPU tensor.
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(...)
SimpleTensor< float > src
Copyright (c) 2017-2021 Arm Limited.
1 channel, 1 F16 per channel
const size_t num_read_elements_per_iteration
CpuDepthwiseConvolutionNativeKernel()
Default constructor.
#define ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(...)
1 channel, 1 S32 per channel
uint32x2_t vqmovn(const uint64x2_t &a)
const ITensor * get_const_tensor(int id) const
Get constant tensor of a given id.
const size_t input_stride_y
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
library fill(src, distribution, 0)
bool is_data_type_quantized_per_channel(DataType dt)
Check if a given data type is of per channel type.
quantized, asymmetric fixed-point 8-bit number unsigned
Class to describe a number of elements in each dimension.
int16x4_t vreinterpret(const uint16x4_t &a)
const uint32_t x_leftover_start
uint8x8_t vmin(const uint8x8_t &a, const uint8x8_t &b)
void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
UniformQuantizationInfo uniform() const
Return per layer quantization info.
bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, QuantizationInfo quantization_info=QuantizationInfo())
Auto initialize the tensor info (shape, number of channels and data type) if the current assignment is empty.
const std::vector< float > & scale() const
Scale vector accessor.
const size_t input_height
uint8x8_t vgetlow(const uint8x16_t val)
void end(TokenStream &in, bool &valid)
uint8x16_t vcombine(const uint8x8_t &a, const uint8x8_t &b)
static constexpr size_t DimW
Alias for dimension 3 also known as W dimension.
virtual QuantizationInfo quantization_info() const =0
Get the quantization settings (scale and offset) of the tensor.
uint16x8_t vmlal(const uint16x8_t &a, const uint8x8_t &b, const uint8x8_t &c)
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
bool is_data_type_quantized_asymmetric(DataType dt)
Check if a given data type is of asymmetric quantized type.
quantized, symmetric per channel fixed-point 8-bit number
uint8x8_t vgethigh(const uint8x16_t val)
void configure(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const ConvolutionInfo &info)
Initialize the function's source, destination and parameters.
static constexpr size_t DimY
Alias for dimension 1 also known as Y dimension.
const size_t weights_width
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
const size_t weights_height
ITensor * get_tensor(int id)
Get tensor of a given id from the pack.
Information about executing thread and CPU.
static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ConvolutionInfo &info)
Static function to check if given info will lead to a valid configuration of CpuDepthwiseConvolutionNativeKernel.
T fma(T x, T y, T z)
Computes (x*y) + z as if to infinite precision and rounded only once to fit the result type.
static constexpr size_t DimZ
Alias for dimension 2 also known as Z dimension.
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(...)
Num samples, height, width, channels.
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage)
const size_t weights_stride_y
const size_t input_stride_z
const size_t input_max_offset
uint8x8_t vload(const uint8_t *ptr)
void vstore(uint8_t *ptr, uint8x8_t val)
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
uint8x8_t vdup_n(uint8_t value, traits::vector_64_tag)
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_function for each element.
const size_t conv_pad_top
quantized, asymmetric fixed-point 8-bit number signed
Includes all wrapper headers at once.
uint8x8_t vmla(const uint8x8_t &a, const uint8x8_t &b, const uint8x8_t &c)
size_t get_data_layout_dimension_index(const DataLayout data_layout, const DataLayoutDimension data_layout_dimension)
Get the index of the given dimension.
uint16x8_t vmovl(const uint8x8_t &a)
uint8x8_t vmax(const uint8x8_t &a, const uint8x8_t &b)
Describe a multidimensional execution window.
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)