// File-scope constants used by the depthwise convolution loops in this file.
// NOTE(review): the bare integers at the start of statements (30, 46, 47, ...) look like
// line numbers carried over from a rendered source listing rather than C++ tokens.
// dim_manual_loop is built from (0, 0, 0) and is not referenced in the visible lines.
30 #include "src/core/NEON/kernels/convolution/depthwise/impl_qa8_qa8.hpp" 46 constexpr
auto dim_manual_loop = Window::Dimension(0, 0, 0);
// Built from (0, 1, 1); the multiplier-1 loops assign it to Window::DimX of their
// execution window.
47 constexpr
auto dim_single_unit_step = Window::Dimension(0, 1, 1);
// Divided by sizeof(T) to obtain the per-vector element count in the multiplier-1 loops,
// and passed directly as the element count to neon_vector<T, vector_size> in the pow2
// quantized loop.
48 constexpr
size_t vector_size = 8;
// Collects the values the depthwise loops read repeatedly: X-range bookkeeping taken from
// the execution window, plus strides, kernel extents, convolution stride/padding and input
// extents taken from the tensor infos.
// Listing lines 51-71, 73 and 75 are not present in this extract, so the member
// declarations and the first initialisers cannot be seen here.
50 struct DepthwiseConvolutionRunInfo
// input / weights: tensor metadata; conv_info: stride and padding; w: execution window;
// depth_multiplier: scales x_step and defaults to 1.
72 DepthwiseConvolutionRunInfo(
const ITensorInfo &
input,
const ITensorInfo &weights,
const PadStrideInfo &
conv_info,
const Window &
w, uint32_t depth_multiplier = 1)
74 x_start(w.x().
start()),
// num_read_elements_per_iteration is initialised in a line that is not visible here.
76 x_step(static_cast<uint32_t>(num_read_elements_per_iteration * depth_multiplier)),
// (window X end - x_step + 1), clamped at 0: loops of the form
// `for(; x < x_leftover_start; x += x_step)` therefore only run while a full x_step
// still fits before the window end.
77 x_leftover_start(
std::max(static_cast<int32_t>(w.x().
end()) - static_cast<int32_t>(x_step) + 1, int32_t(0))),
78 input_stride_y(input.strides_in_bytes().y()),
79 input_stride_z(input.strides_in_bytes().z()),
// Z stride times the height_idx dimension, minus (bottom + top padding) rows of Y stride.
// The loops use it as the upper clamp for input byte offsets.
80 input_max_offset(input.strides_in_bytes().z() * input.dimension(height_idx) - (input.padding().bottom + input.padding().top) * input.strides_in_bytes().y()),
// height_idx / width_idx / channel_idx are defined outside the visible lines.
81 weights_width(weights.dimension(width_idx)),
82 weights_height(weights.dimension(height_idx)),
83 weights_stride_y(weights.strides_in_bytes().y()),
84 weights_stride_z(weights.strides_in_bytes().z()),
85 conv_stride_x(conv_info.stride().first),
86 conv_stride_y(conv_info.stride().second),
87 conv_pad_left(conv_info.pad_left()),
88 conv_pad_top(conv_info.pad_top()),
89 input_height(input.dimension(height_idx)),
90 input_width(input.dimension(width_idx)),
91 input_depth(input.dimension(channel_idx))
// Returns true when kernel tap (w, h), displaced from (base_w, base_h) by the dilation,
// lands inside [0, input_width) x [0, input_height) of run_info.
// NOTE(review): base_h is uint32_t, yet every visible caller passes a signed int32_t
// (input_z = id.z() * conv_stride_y - conv_pad_top), which can be negative. The sum is then
// formed in unsigned arithmetic and converted back to int32_t; base_w is int32_t. Confirm
// the asymmetry is intended.
96 inline bool is_valid_input_region(int32_t base_w, uint32_t base_h, uint32_t
w, uint32_t h,
const DepthwiseConvolutionRunInfo &run_info,
const Size2D &dilation)
// Row check against the input height.
98 const int32_t current_h = base_h + h * dilation.y();
99 const bool is_valid_h = current_h >= 0 && current_h < static_cast<int32_t>(run_info.input_height);
// Column check against the input width.
101 const int32_t current_w = base_w + w * dilation.x();
102 const bool is_valid_w = current_w >= 0 && current_w < static_cast<int32_t>(run_info.input_width);
104 return is_valid_h && is_valid_w;
// Depthwise loop for element type T, selected by the float run_depthwise overload when
// _depth_multiplier == 1. For each window position it walks X in vector-sized steps up to
// x_leftover_start and then finishes the remaining X positions one element at a time.
// Braces, the VectorType alias, the window-loop lambda header and several statements fall
// on listing lines that are not present in this extract.
107 template <
typename T>
108 void depthwise_loop_multiplier1_fp(
const ITensor *
input,
const ITensor *weights,
const ITensor *biases, ITensor *output,
const PadStrideInfo &
conv_info,
109 const Size2D &dilation,
const Window &window,
bool has_biases)
// Elements of T per vector: vector_size / sizeof(T).
111 constexpr
auto element_per_vector = vector_size /
sizeof(T);
113 using TagType =
typename wrapper::traits::neon_vector<T, element_per_vector>::tag_type;
115 const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(),
conv_info, window);
// Accumulators start from this all-zero vector.
117 const VectorType zero_vector =
wrapper::vdup_n(static_cast<T>(0), TagType{});
// The execution window gets dim_single_unit_step on X; X is advanced by the explicit
// loops further down.
119 Window execution_window = window;
120 execution_window.set(
Window::DimX, dim_single_unit_step);
// Any adjustments to these windows sit on listing lines not shown here.
122 Window win_input = window;
127 Window win_weights = win_input;
130 Window win_output = window;
133 Iterator input_it(input, win_input);
134 Iterator weights_it(weights, win_weights);
135 Iterator output_it(output, win_output);
136 Iterator biases_it{};
// The bias iterator reuses the weights window; the guarding condition is not visible.
140 biases_it = Iterator(biases, win_weights);
// `id` is not declared in the visible lines - presumably the coordinate handed to the
// window-loop lambda; confirm. Both values may be negative because padding is subtracted.
145 const int32_t input_y =
id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
146 const int32_t input_z =
id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
// Byte offset of the first kernel tap relative to input_it.ptr().
147 const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
149 auto const base_weights_ptr = weights_it.ptr();
150 uint32_t x = run_info.x_start;
// Vector part: x advances by x_step while x < x_leftover_start.
152 for(; x < run_info.x_leftover_start; x += run_info.x_step)
154 VectorType acc = zero_vector;
155 auto weights_ptr = base_weights_ptr;
156 int64_t input_offset = base_input_offset;
158 for(uint32_t h = 0; h < run_info.weights_height; ++h)
160 int64_t offs = input_offset + x *
sizeof(T);
161 for(uint32_t w = 0; w < run_info.weights_width; ++
w)
163 const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
// In-bounds taps load from the input with the byte offset clamped to input_max_offset;
// the operand used for out-of-bounds taps (listing line 166) is not visible.
164 const auto input_vals = is_valid_region ?
165 wrapper::vload(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) :
167 const auto weights_vals =
wrapper::vload(reinterpret_cast<T *>(weights_ptr + w * run_info.weights_stride_y) + x);
// Next kernel column: dilation.x() steps of the input Y stride.
170 offs += dilation.x() * run_info.input_stride_y;
// Next kernel row: one weights Z stride and dilation.y() input Z strides.
173 weights_ptr += run_info.weights_stride_z;
174 input_offset += dilation.y() * run_info.input_stride_z;
179 const auto biases_vals =
wrapper::vload(reinterpret_cast<T *>(biases_it.ptr()) + x);
// Leftover part: the same computation, one element at a time, up to x_end.
186 for(; x < run_info.x_end; ++x)
188 auto acc_scalar = T{ 0 };
189 auto weights_ptr = base_weights_ptr;
190 int64_t input_offset = base_input_offset;
192 for(
size_t h = 0; h < run_info.weights_height; ++h)
194 int64_t offs = input_offset + x *
sizeof(T);
195 for(
size_t w = 0; w < run_info.weights_width; ++
w)
197 const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
// Out-of-bounds taps contribute 0.
198 const auto input_vals = is_valid_region ? *
reinterpret_cast<T *
>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset)) : 0;
199 const auto weights_vals = *(
reinterpret_cast<T *
>(weights_ptr + w * run_info.weights_stride_y) + x);
201 acc_scalar += (input_vals * weights_vals);
203 offs += dilation.x() * run_info.input_stride_y;
206 weights_ptr += run_info.weights_stride_z;
207 input_offset += dilation.y() * run_info.input_stride_z;
// Bias add; the guarding condition is not visible (presumably has_biases).
212 const auto biases_vals = *(
reinterpret_cast<T *
>(biases_it.ptr()) + x);
213 acc_scalar += biases_vals;
215 *(
reinterpret_cast<T *
>(output_it.ptr()) + x) = acc_scalar;
// Trailing arguments of a call whose start is not visible (presumably the window loop).
218 input_it, weights_it, biases_it, output_it);
// Depthwise loop for element type T, selected by the float run_depthwise overload when
// _depth_multiplier != 1. Each input element read is paired with depth_multiplier
// consecutive weights and produces depth_multiplier consecutive outputs.
// Braces, the window-loop lambda header and the accumulate statement fall on listing lines
// that are not present in this extract.
221 template <
typename T>
222 void depthwise_loop_generic_fp(
const ITensor *input,
const ITensor *weights,
const ITensor *biases, ITensor *output,
const PadStrideInfo &conv_info,
223 const Size2D &dilation,
unsigned int depth_multiplier,
const Window &window,
bool has_biases)
225 const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(),
conv_info, window, depth_multiplier);
// Execution and input windows cover X = [0, input_depth) in steps of 1.
227 Window execution_window = window;
228 execution_window.set(
Window::DimX, Window::Dimension(0, run_info.input_depth, 1));
230 Window win_input = execution_window;
231 win_input.set(
Window::DimX, Window::Dimension(0, run_info.input_depth, 1));
// Weights and output windows step X by run_info.x_step.
235 Window win_weights = window;
236 win_weights.set_dimension_step(
Window::DimX, run_info.x_step);
241 Window win_output = window;
242 win_output.set_dimension_step(
Window::DimX, run_info.x_step);
244 Iterator input_it(input, win_input);
245 Iterator weights_it(weights, win_weights);
246 Iterator output_it(output, win_output);
247 Iterator biases_it{};
// The bias iterator reuses the weights window; the guarding condition is not visible.
251 biases_it = Iterator(biases, win_weights);
// One zero-initialised accumulator per depth-multiplier slot.
256 std::vector<T> acc(depth_multiplier, static_cast<T>(0));
// `id` is not declared in the visible lines - presumably the window-loop coordinate.
258 const int input_y =
id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
259 const int input_z =
id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
// NOTE(review): plain int here, whereas depthwise_loop_multiplier1_fp keeps the same
// quantity in int64_t - confirm the narrower type is adequate for the strides involved.
260 int input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
262 auto weights_ptr = weights_it.ptr();
263 for(
size_t h = 0; h < run_info.weights_height; ++h)
265 int offs = input_offset;
266 for(
size_t w = 0; w < run_info.weights_width; ++
w)
268 const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
// In-bounds taps read the input with the byte offset clamped to input_max_offset;
// out-of-bounds taps use T(0).
269 const auto input_val = is_valid_region ? *(
reinterpret_cast<T *
>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) : T(0);
271 for(
size_t m = 0; m < depth_multiplier; ++m)
// Weight for slot m at kernel column w; the statement that folds it into acc is not
// visible in this extract.
273 const auto weights_val = *(
reinterpret_cast<T *
>(weights_ptr + m *
sizeof(T) + w * run_info.weights_stride_y));
277 offs += dilation.x() * run_info.input_stride_y;
280 weights_ptr += run_info.weights_stride_z;
281 input_offset += dilation.y() * run_info.input_stride_z;
// Store with bias added; the condition choosing between this loop and the next is not
// visible (presumably has_biases).
286 for(
size_t m = 0; m < depth_multiplier; ++m)
288 const auto biases_val = *(
reinterpret_cast<T *
>(biases_it.ptr() + m *
sizeof(T)));
289 *(
reinterpret_cast<T *
>(output_it.ptr() + m *
sizeof(T))) = acc.at(m) + biases_val;
// Store without bias.
294 for(
size_t m = 0; m < depth_multiplier; ++m)
296 *(
reinterpret_cast<T *
>(output_it.ptr() + m *
sizeof(T))) = acc.at(m);
// Trailing arguments of a call whose start is not visible (presumably the window loop).
300 input_it, weights_it, biases_it, output_it);
// Quantized depthwise loop (input/output type T, weights type TW), selected by the
// quantized run_depthwise overload when _depth_multiplier == 1. Products are accumulated
// in int32_t, corrected for the input/weights quantization offsets, requantized with the
// per-index output_multiplier / output_shift, and clamped to T.
// Braces, the declarations of acc / out_vals / in_sum (scalar) and several branch
// conditions fall on listing lines that are not present in this extract.
303 template <
typename T,
typename TW>
304 void depthwise_loop_multiplier1_quantized(
const ITensor *input,
const ITensor *weights,
const ITensor *biases, ITensor *output,
const PadStrideInfo &conv_info,
305 const Size2D &dilation, std::vector<int> output_multiplier, std::vector<int> output_shift,
const Window &window,
bool has_biases)
307 constexpr
auto element_per_vector = vector_size /
sizeof(T);
309 using TagType =
typename wrapper::traits::neon_vector<T, element_per_vector>::tag_type;
310 using AccType = int32_t;
311 using AccArrayType = std::array<AccType, element_per_vector>;
// Value (and broadcast vector) built from PixelValue(0, data type, quantization info);
// their use for out-of-bounds taps sits on lines that are not visible here.
313 const auto out_of_bound_value = PixelValue(static_cast<uint64_t>(0), input->info()->data_type(), input->info()->quantization_info()).get<T>();
314 const auto out_of_bound_vector =
wrapper::vdup_n(static_cast<T>(out_of_bound_value), TagType{});
316 const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(),
conv_info, window);
318 const int32_t input_qoffset = input->info()->quantization_info().uniform().offset;
319 const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset;
320 const int32_t output_qoffset = output->info()->quantization_info().uniform().offset;
// Constant term of the offset correction: (kernel width * height) * in_offset * w_offset.
321 const int32_t k_offset = run_info.weights_width * run_info.weights_height * input_qoffset * weights_qoffset;
323 Window execution_window = window;
324 execution_window.set(
Window::DimX, dim_single_unit_step);
326 Window win_input = window;
331 Window win_weights = win_input;
334 Window win_output = window;
337 Iterator input_it(input, win_input);
338 Iterator weights_it(weights, win_weights);
339 Iterator output_it(output, win_output);
340 Iterator biases_it{};
// The bias iterator reuses the weights window; the guarding condition is not visible.
344 biases_it = Iterator(biases, win_weights);
// `id` is not declared in the visible lines - presumably the window-loop coordinate.
349 const int32_t input_y =
id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
350 const int32_t input_z =
id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
351 const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
352 auto const base_weights_ptr = weights_it.ptr();
353 size_t x = run_info.x_start;
// Vector-width part: element_per_vector lanes per step, accumulated lane by lane.
355 for(; x < run_info.x_leftover_start; x += run_info.x_step)
// Per-lane running sums of inputs and weights, needed for the offset correction below.
358 AccArrayType in_sum{};
359 AccArrayType we_sum{};
361 auto weights_ptr = base_weights_ptr;
362 auto input_offset = base_input_offset;
364 for(
size_t h = 0; h < run_info.weights_height; ++h)
366 int64_t offs = input_offset + x *
sizeof(T);
367 for(
size_t w = 0; w < run_info.weights_width; ++
w)
369 const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
// In-bounds taps load from the input with the byte offset clamped to input_max_offset;
// the operand for out-of-bounds taps (listing line 372) is not visible.
370 const auto input_vals = is_valid_region ?
371 wrapper::vload(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) :
373 const auto weights_vals =
wrapper::vload(reinterpret_cast<TW *>(weights_ptr + w * run_info.weights_stride_y) + x);
375 for(
size_t i = 0; i < element_per_vector; ++i)
377 acc.at(i) += input_vals[i] * weights_vals[i];
378 in_sum.at(i) += input_vals[i];
379 we_sum.at(i) += weights_vals[i];
382 offs += dilation.x() * run_info.input_stride_y;
385 weights_ptr += run_info.weights_stride_z;
386 input_offset += dilation.y() * run_info.input_stride_z;
390 for(
size_t i = 0; i < element_per_vector; ++i)
// Offset correction: sum(in*w) - w_off*sum(in) - in_off*sum(w) + k_offset.
392 acc.at(i) -= in_sum.at(i) * weights_qoffset;
393 acc.at(i) -= we_sum.at(i) * input_qoffset;
394 acc.at(i) += k_offset;
// Bias add; the guarding condition is not visible (presumably has_biases).
398 acc.at(i) += *(
reinterpret_cast<int32_t *
>(biases_it.ptr() + i *
sizeof(int32_t)) + x);
401 const int32_t out_mul = output_multiplier.at(x + i);
402 const int32_t out_shift = output_shift.at(x + i);
// Two requantization forms follow; the branch selecting between them is not visible
// (the first shifts left by -out_shift, so presumably it handles out_shift < 0).
405 acc.at(i) = saturating_doubling_high_mul(acc.at(i) * (1 << (-out_shift)), out_mul) + output_qoffset;
409 acc.at(i) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(i), out_mul), out_shift) + output_qoffset;
411 out_vals[i] =
static_cast<T
>(utility::clamp<AccType, T>(acc.at(i)));
// Leftover part: the same computation with scalar accumulators, up to x_end.
418 for(; x < run_info.x_end; ++x)
424 auto weights_ptr = base_weights_ptr;
425 auto input_offset = base_input_offset;
427 for(
size_t h = 0; h < run_info.weights_height; ++h)
429 int64_t offs = input_offset + x *
sizeof(T);
430 for(
size_t w = 0; w < run_info.weights_width; ++
w)
432 const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
// The operand for out-of-bounds taps (listing line 435) is not visible.
433 const auto input_val = is_valid_region ?
434 *
reinterpret_cast<T *
>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset)) :
436 const auto weights_val = *(
reinterpret_cast<TW *
>(weights_ptr + w * run_info.weights_stride_y) + x);
438 acc += input_val * weights_val;
440 we_sum += weights_val;
442 offs += dilation.x() * run_info.input_stride_y;
445 weights_ptr += run_info.weights_stride_z;
446 input_offset += dilation.y() * run_info.input_stride_z;
451 acc -= in_sum * weights_qoffset;
452 acc -= we_sum * input_qoffset;
457 acc += *(
reinterpret_cast<int32_t *
>(biases_it.ptr()) + x);
460 const int32_t out_mul = output_multiplier.at(x);
461 const int32_t out_shift = output_shift.at(x);
// Same two requantization forms as in the vector-width part.
465 acc = saturating_doubling_high_mul(acc * (1 << (-out_shift)), out_mul) + output_qoffset;
469 acc = rounding_divide_by_exp2(saturating_doubling_high_mul(acc, out_mul), out_shift) + output_qoffset;
472 out_vals =
static_cast<T
>(utility::clamp<AccType, T>(acc));
473 *(
reinterpret_cast<T *
>(output_it.ptr()) + x) = out_vals;
// Trailing arguments of a call whose start is not visible (presumably the window loop).
476 input_it, weights_it, biases_it, output_it);
// Quantized depthwise loop (input/output type T, weights type TW) for a general
// depth_multiplier; the quantized run_depthwise overload uses it when neither the
// multiplier-1 nor the pow2 per-tensor path applies. Each input element read is paired
// with depth_multiplier consecutive weights; results are offset-corrected, requantized and
// clamped to T.
// Braces, the in_sum declaration/update and several branch conditions fall on listing
// lines that are not present in this extract.
479 template <
typename T,
typename TW>
480 void depthwise_loop_generic_quantized(
const ITensor *input,
const ITensor *weights,
const ITensor *biases, ITensor *output,
const PadStrideInfo &conv_info,
481 const Size2D &dilation,
unsigned int depth_multiplier, std::vector<int> output_multiplier, std::vector<int> output_shift,
const Window &window,
bool has_biases)
483 using AccType = int32_t;
485 const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(),
conv_info, window, depth_multiplier);
// Substituted for the input value of taps outside the input (see input_val below).
487 const auto out_of_bound_value = PixelValue(static_cast<uint64_t>(0), input->info()->data_type(), input->info()->quantization_info()).get<T>();
489 const int32_t input_qoffset = input->info()->quantization_info().uniform().offset;
490 const int32_t weights_qoffset = weights->info()->quantization_info().uniform().offset;
491 const int32_t output_qoffset = output->info()->quantization_info().uniform().offset;
// Constant term of the offset correction: (kernel width * height) * in_offset * w_offset.
492 const int32_t k_offset = run_info.weights_width * run_info.weights_height * input_qoffset * weights_qoffset;
// Execution/input windows cover X = [0, input_depth) in steps of 1; weights and output
// windows step X by run_info.x_step.
494 Window execution_window = window;
495 execution_window.set(
Window::DimX, Window::Dimension(0, run_info.input_depth, 1));
497 Window win_input = execution_window;
501 Window win_weights = window;
502 win_weights.set_dimension_step(
Window::DimX, run_info.x_step);
507 Window win_output = window;
508 win_output.set_dimension_step(
Window::DimX, run_info.x_step);
510 Iterator input_it(input, win_input);
511 Iterator weights_it(weights, win_weights);
512 Iterator output_it(output, win_output);
513 Iterator biases_it{};
// The bias iterator reuses the weights window; the guarding condition is not visible.
517 biases_it = Iterator(biases, win_weights);
// Per-slot accumulators and per-slot weight sums, one entry per depth-multiplier slot.
522 std::vector<AccType> acc(depth_multiplier, 0);
523 std::vector<AccType> we_sum(depth_multiplier, 0);
// `id` is not declared in the visible lines - presumably the window-loop coordinate.
526 const int32_t input_y =
id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
527 const int32_t input_z =
id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
528 int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
530 auto weights_ptr = weights_it.ptr();
531 for(
size_t h = 0; h < run_info.weights_height; ++h)
// NOTE(review): narrows the int64_t input_offset to int - confirm this cannot truncate
// for the tensor sizes in use.
533 int offs = input_offset;
534 for(
size_t w = 0; w < run_info.weights_width; ++
w)
536 const bool is_valid_region = is_valid_input_region(input_y, input_z, w, h, run_info, dilation);
537 const auto input_val = is_valid_region ? *(
reinterpret_cast<T *
>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))) : out_of_bound_value;
539 for(
size_t m = 0; m < depth_multiplier; ++m)
// NOTE(review): the byte offset is m * sizeof(T) while the value is read through TW *;
// the two agree only when sizeof(T) == sizeof(TW).
541 const auto weights_val = *(
reinterpret_cast<TW *
>(weights_ptr + m *
sizeof(T) + w * run_info.weights_stride_y));
542 acc.at(m) += input_val * weights_val;
544 we_sum.at(m) += weights_val;
547 offs += dilation.x() * run_info.input_stride_y;
551 weights_ptr += run_info.weights_stride_z;
552 input_offset += dilation.y() * run_info.input_stride_z;
555 for(
size_t m = 0; m < depth_multiplier; ++m)
// Offset correction: sum(in*w) - w_off*sum(in) - in_off*sum(w) + k_offset.
557 acc.at(m) -= in_sum * weights_qoffset;
558 acc.at(m) -= we_sum.at(m) * input_qoffset;
559 acc.at(m) += k_offset;
// Bias add; the guarding condition is not visible (presumably has_biases).
563 acc.at(m) += *(
reinterpret_cast<int32_t *
>(biases_it.ptr() + m *
sizeof(int32_t)));
// Requantization parameters are indexed by id.x() * depth_multiplier + m.
566 const int32_t out_mul = output_multiplier.at(
id.x() * depth_multiplier + m);
567 const int32_t out_shift = output_shift.at(
id.x() * depth_multiplier + m);
// Two requantization forms follow; the branch selecting between them is not visible
// (the first shifts left by -out_shift, so presumably it handles out_shift < 0).
570 acc.at(m) = saturating_doubling_high_mul(acc.at(m) * (1 << (-out_shift)), out_mul) + output_qoffset;
574 acc.at(m) = rounding_divide_by_exp2(saturating_doubling_high_mul(acc.at(m), out_mul), out_shift) + output_qoffset;
576 *(
reinterpret_cast<T *
>(output_it.ptr() + m *
sizeof(T))) = static_cast<T>(utility::clamp<AccType, T>(acc.at(m)));
// Trailing arguments of a call whose start is not visible (presumably the window loop).
579 input_it, weights_it, biases_it, output_it);
// Quantized depthwise loop (input/output type T, weights type TW) that the quantized
// run_depthwise overload picks when the depth multiplier is a power of two, at least 8,
// and is_quantized_per_tensor holds. It uses a single (out_mul, out_shift) pair and
// processes depth-multiplier slots vector_size at a time with two int32 accumulator
// vectors per group.
// Many lines (AccVectorType, the offset vectors, widening/multiply-accumulate steps,
// accumulator reset, final narrowing/store) are not present in this extract.
582 template <
typename T,
typename TW>
583 void depthwise_loop_pow2_quantized_per_tensor(
const ITensor *input,
const ITensor *weights,
const ITensor *biases, ITensor *output,
const PadStrideInfo &conv_info,
584 const Size2D &dilation,
unsigned int depth_multiplier, std::vector<int> output_multiplier, std::vector<int> output_shift,
const Window &window,
bool has_biases)
586 constexpr
int half_vec = vector_size / 2;
588 using AccType = int32_t;
590 using AccVectorTagType =
typename wrapper::traits::neon_vector<AccType, half_vec>::tag_type;
591 using TagType =
typename wrapper::traits::neon_vector<T, vector_size>::tag_type;
593 const auto run_info = DepthwiseConvolutionRunInfo(*input->info(), *weights->info(),
conv_info, window, depth_multiplier);
// Broadcast bounds in the accumulator type: numeric_limits<T>::max() and 0. Their use is
// on lines that are not visible here.
600 const auto upper =
wrapper::vdup_n(static_cast<AccType>(std::numeric_limits<T>::max()), AccVectorTagType{});
601 const auto zero =
wrapper::vdup_n(static_cast<AccType>(0), AccVectorTagType{});
// Only element 0 of the requantization vectors is read.
603 const auto out_mul = output_multiplier.at(0);
604 const auto out_shift = output_shift.at(0);
606 Window execution_window = window;
607 execution_window.set(
Window::DimX, Window::Dimension(0, run_info.input_depth, 1));
609 Window win_input = execution_window;
613 Window win_weights = window;
614 win_weights.set_dimension_step(
Window::DimX, run_info.x_step);
619 Window win_output = window;
620 win_output.set_dimension_step(
Window::DimX, run_info.x_step);
622 Iterator input_it(input, win_input);
623 Iterator weights_it(weights, win_weights);
624 Iterator output_it(output, win_output);
625 Iterator biases_it{};
// The bias iterator reuses the weights window; the guarding condition is not visible.
629 biases_it = Iterator(biases, win_weights);
// One accumulator-vector pair per group of vector_size depth-multiplier slots.
632 std::vector<AccVectorType> acc0(depth_multiplier / vector_size);
633 std::vector<AccVectorType> acc1(depth_multiplier / vector_size);
// `id` is not declared in the visible lines - presumably the window-loop coordinate.
640 const int32_t input_y =
id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
641 const int32_t input_z =
id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
642 int64_t input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
644 auto weights_ptr = weights_it.ptr();
645 for(
size_t h = 0; h < run_info.weights_height; ++h)
// Bounds are tested inline here: rows outside [0, input_height) are skipped.
647 const int32_t current_h = input_z + h * dilation.y();
648 if(current_h >= 0 && current_h < static_cast<int32_t>(run_info.input_height))
650 int offs = input_offset;
651 for(
size_t w = 0; w < run_info.weights_width; ++
w)
// Columns outside [0, input_width) are skipped as well.
653 const int32_t current_w = input_y + w * dilation.x();
654 if(current_w >= 0 && current_w < static_cast<int32_t>(run_info.input_width))
// Broadcast the single input element across a vector of T.
656 const auto input_8x8 =
wrapper::vdup_n(*(reinterpret_cast<T *>(input_it.ptr() + std::min(static_cast<size_t>(offs), run_info.input_max_offset))), TagType{});
// input_s16x8 / input_qoffset_vec are defined on lines that are not visible here.
658 const auto input_no_offs =
wrapper::vsub(input_s16x8, input_qoffset_vec);
660 for(
size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i)
// NOTE(review): the byte offset is m * sizeof(T) while the load goes through TW *;
// the two agree only when sizeof(T) == sizeof(TW).
662 const auto weights_8x8 =
wrapper::vload(reinterpret_cast<TW *>(weights_ptr + m *
sizeof(T) + w * run_info.weights_stride_y));
// weights_s16x8 / weights_qoffset_vec are defined on lines that are not visible here.
664 const auto weights_no_offs =
wrapper::vsub(weights_s16x8, weights_qoffset_vec);
671 offs += dilation.x() * run_info.input_stride_y;
675 weights_ptr += run_info.weights_stride_z;
676 input_offset += dilation.y() * run_info.input_stride_z;
// Per group: load two bias vectors (slots m.. and m + half_vec..), requantize both
// accumulators, then finish on lines that are not visible here.
679 for(
size_t m = 0, i = 0; m < depth_multiplier; m += vector_size, ++i)
683 const auto bias_val0 =
wrapper::vloadq(reinterpret_cast<int32_t *>(biases_it.ptr() + m *
sizeof(int32_t)));
684 const auto bias_val1 =
wrapper::vloadq(reinterpret_cast<int32_t *>(biases_it.ptr() + (m + half_vec) *
sizeof(int32_t)));
// Two requantization forms follow; the branch selecting between them is not visible
// (the first shifts left by -out_shift, so presumably it handles out_shift < 0).
692 acc0.at(i) =
wrapper::vadd(saturating_doubling_high_mul(acc0.at(i) * (1 << (-out_shift)), out_mul), output_qoffset_vec);
693 acc1.at(i) =
wrapper::vadd(saturating_doubling_high_mul(acc1.at(i) * (1 << (-out_shift)), out_mul), output_qoffset_vec);
697 acc0.at(i) =
wrapper::vadd(rounding_divide_by_exp2(saturating_doubling_high_mul(acc0.at(i), out_mul), out_shift), output_qoffset_vec);
698 acc1.at(i) =
wrapper::vadd(rounding_divide_by_exp2(saturating_doubling_high_mul(acc1.at(i), out_mul), out_shift), output_qoffset_vec);
// Type-dependent finishing step; both branches are on lines that are not visible here.
707 if(std::is_same<T, uint8_t>::value)
// Trailing arguments of a call whose start is not visible (presumably the window loop).
717 input_it, weights_it, biases_it, output_it);
// Checks the tensor infos and convolution parameters and returns a Status; the
// ARM_COMPUTE_RETURN_ERROR_ON macros return an error when their condition is true.
// Most checks (listing lines 722-727, 730-743, 745-758, 760 onwards) are not present in
// this extract.
720 Status
validate_arguments(
const ITensorInfo *input,
const ITensorInfo *weights,
const ITensorInfo *biases,
const ITensorInfo *output,
const PadStrideInfo &conv_info,
unsigned int depth_multiplier,
721 const Size2D &dilation)
// Error when the dilated kernel extent along dimension 1,
// k + (k - 1) * (dilation.x() - 1), exceeds the input extent plus left and right padding.
728 ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(1) + (weights->dimension(1) - 1) * (dilation.x() - 1) > input->dimension(1) + conv_info.pad_left() + conv_info.pad_right());
// Same check along dimension 2 with dilation.y() and top/bottom padding.
729 ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(2) + (weights->dimension(2) - 1) * (dilation.y() - 1) > input->dimension(2) + conv_info.pad_top() + conv_info.pad_bottom());
// Bias-specific checks apply only when a bias info is supplied (bodies not visible).
744 if(biases !=
nullptr)
// Output-specific checks apply only when the output info already has a non-zero size.
759 if(output->total_size() != 0)
// Fragments of the kernel's constructor, configure(), validate() and run(); the function
// headers, braces and branch conditions fall on listing lines not present in this extract.
// Constructor initialiser list: every member is value-initialised except
// _depth_multiplier, which starts at 1.
771 : _func(), _input(), _weights(), _biases(), _output(), _conv_info(), _depth_multiplier(1), _dilation(), _output_multiplier(), _output_shift(), _has_biases()
// configure(): store the parameters; _has_biases records whether a bias tensor was given.
786 _depth_multiplier = depth_multiplier;
787 _dilation = dilation;
788 _has_biases = (biases !=
nullptr);
// Replicates the first weights scale so that weights_scale has one entry per
// channel_idx element; the condition guarding this loop is not visible.
798 for(
size_t i = 1; i < _weights->
info()->
dimension(channel_idx); ++i)
800 weights_scale.push_back(weights_scale.front());
// For every weights scale s, derive the requantization pair from
// input_scale * s / output_scale and append it to _output_multiplier / _output_shift.
// The call that fills out_mult / out_shift is on a line that is not visible here.
804 for(
const auto &s : weights_scale)
806 int32_t out_mult = 0;
807 int32_t out_shift = 0;
808 const float multiplier = input_scale * s / output_scale;
811 _output_multiplier.push_back(out_mult);
812 _output_shift.push_back(out_shift);
// Selection of the run_depthwise instantiation by element types; the conditions choosing
// each assignment are not visible.
819 _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<uint8_t, uint8_t>;
822 _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<int8_t, int8_t>;
827 _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<uint8_t, int8_t>;
831 _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<int8_t, int8_t>;
// The float16_t instantiation is only compiled when FP16 vector arithmetic is available.
834 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 836 _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<float16_t, float16_t>;
838 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 840 _func = &NEDepthwiseConvolutionLayerNativeKernel::run_depthwise<float, float>;
// Registers the window `win` (computed on lines not visible here) with the base kernel.
854 INEKernel::configure(win);
// Part of a parameter list whose surrounding declaration is not visible.
858 unsigned int depth_multiplier,
// run(): invoke the member function selected in configure().
871 (this->*_func)(window, _has_biases);
// run_depthwise overload constrained by FloatEnalber<T>: forwards the stored tensors and
// parameters to depthwise_loop_multiplier1_fp when _depth_multiplier == 1, otherwise to
// depthwise_loop_generic_fp (the `else` and braces are on lines not present here).
// TW is not forwarded to either loop in the visible calls.
874 template <
typename T,
typename TW, NEDepthwiseConvolutionLayerNativeKernel::FloatEnalber<T>>
875 void NEDepthwiseConvolutionLayerNativeKernel::run_depthwise(
const Window &
window,
bool has_biases)
880 if(_depth_multiplier == 1)
882 depthwise_loop_multiplier1_fp<T>(_input, _weights, _biases, _output, _conv_info, _dilation,
window, has_biases);
886 depthwise_loop_generic_fp<T>(_input, _weights, _biases, _output, _conv_info, _dilation, _depth_multiplier,
window, has_biases);
// run_depthwise overload constrained by Quantized8bitEnalber<T>: picks one of three
// quantized loops. The `else` keywords, braces and the definition of
// is_quantized_per_tensor are on lines not present in this extract.
890 template <
typename T,
typename TW, NEDepthwiseConvolutionLayerNativeKernel::Quantized8bitEnalber<T>>
891 void NEDepthwiseConvolutionLayerNativeKernel::run_depthwise(
const Window &window,
bool has_biases)
// Depth multiplier of 1: dedicated loop.
896 if(_depth_multiplier == 1)
898 depthwise_loop_multiplier1_quantized<T, TW>(_input, _weights, _biases, _output, _conv_info, _dilation, _output_multiplier, _output_shift,
window, has_biases);
// (v & (v - 1)) == 0 holds when at most one bit of v is set.
902 const bool is_pow2 = ((_depth_multiplier & (_depth_multiplier - 1)) == 0);
// The pow2 loop needs a power-of-two multiplier of at least 8 and per-tensor quantization.
905 if(is_pow2 && is_quantized_per_tensor && _depth_multiplier >= 8)
907 depthwise_loop_pow2_quantized_per_tensor<T, TW>(_input, _weights, _biases, _output, _conv_info, _dilation, _depth_multiplier, _output_multiplier, _output_shift,
window, has_biases);
// Fallback for every other depth multiplier / quantization combination.
911 depthwise_loop_generic_quantized<T, TW>(_input, _weights, _biases, _output, _conv_info, _dilation, _depth_multiplier, _output_multiplier, _output_shift,
window, has_biases);
bool is_data_type_quantized(DataType dt)
Check if a given data type is of quantized type.
virtual size_t num_dimensions() const =0
The number of dimensions of the tensor (rank)
const size_t conv_pad_top
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
Traits defined on Neon vectors.
const Window & window() const
The maximum window the kernel can be executed on.
uint32x2_t vmovn(const uint64x2_t &a)
#define ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(tensor)
TensorShape compute_depthwise_convolution_shape(const ITensorInfo &input, const ITensorInfo &weights, PadStrideInfo conv_info, unsigned int depth_multiplier, const Size2D &dilation=Size2D(1U, 1U))
Calculate the depthwise convolution output shape of a tensor.
virtual size_t dimension(size_t index) const =0
Return the size of the requested dimension.
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
const size_t weights_stride_y
uint8x16_t vloadq(const uint8_t *ptr)
#define ARM_COMPUTE_RETURN_ON_ERROR(status)
Checks if a status contains an error and returns it.
size_t element_size_from_data_type(DataType dt)
The size in bytes of the data type.
virtual DataType data_type() const =0
Data type used for each element of the tensor.
uint8x8_t vadd(const uint8x8_t &a, const uint8x8_t &b)
1 channel, 1 F32 per channel
const size_t input_height
const DataLayout data_layout
Store the tensor's metadata.
#define ARM_COMPUTE_ERROR_THROW_ON(status)
uint8x8_t vsub(const uint8x8_t &a, const uint8x8_t &b)
Status calculate_quantized_multiplier(float multiplier, int32_t *quant_multiplier, int32_t *shift, bool ignore_epsilon=false)
Calculate quantized representation of multiplier.
const size_t input_stride_y
#define ARM_COMPUTE_RETURN_ERROR_ON(cond)
If the condition is true, an error is returned.
decltype(strategy::transforms) typedef type
void run(const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
Interface for Neon tensor.
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(...)
const size_t weights_height
Copyright (c) 2017-2021 Arm Limited.
virtual void set_valid_region(const ValidRegion &valid_region)=0
Set the valid region of the tensor.
1 channel, 1 F16 per channel
#define ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(...)
1 channel, 1 S32 per channel
uint32x2_t vqmovn(const uint64x2_t &a)
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
library fill(src, distribution, 0)
bool is_data_type_quantized_per_channel(DataType dt)
Check if a given data type is of per channel type.
virtual const TensorShape & tensor_shape() const =0
Size for each dimension of the tensor.
quantized, asymmetric fixed-point 8-bit number unsigned
Class to describe a number of elements in each dimension.
int16x4_t vreinterpret(const uint16x4_t &a)
uint8x8_t vmin(const uint8x8_t &a, const uint8x8_t &b)
const size_t conv_stride_x
UniformQuantizationInfo uniform() const
Return per layer quantization info.
bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, QuantizationInfo quantization_info=QuantizationInfo())
Auto initialize the tensor info (shape, number of channels and data type) if the current assignment is empty.
virtual std::unique_ptr< T > clone() const =0
Provide a clone of the current object of class T.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
const std::vector< float > & scale() const
Scale vector accessor.
uint8x8_t vgetlow(const uint8x16_t val)
Padding and stride information class.
void end(TokenStream &in, bool &valid)
uint8x16_t vcombine(const uint8x8_t &a, const uint8x8_t &b)
static constexpr size_t DimW
Alias for dimension 3 also known as W dimension.
NEDepthwiseConvolutionLayerNativeKernel()
Default constructor.
virtual QuantizationInfo quantization_info() const =0
Get the quantization settings (scale and offset) of the tensor.
uint16x8_t vmlal(const uint16x8_t &a, const uint8x8_t &b, const uint8x8_t &c)
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
bool is_data_type_quantized_asymmetric(DataType dt)
Check if a given data type is of asymmetric quantized type.
quantized, symmetric per channel fixed-point 8-bit number
uint8x8_t vgethigh(const uint8x16_t val)
const size_t conv_stride_y
static constexpr size_t DimY
Alias for dimension 1 also known as Y dimension.
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
Information about executing thread and CPU.
T fma(T x, T y, T z)
Computes (x*y) + z as if to infinite precision and rounded only once to fit the result type...
static constexpr size_t DimZ
Alias for dimension 2 also known as Z dimension.
const size_t num_read_elements_per_iteration
Class for specifying the size of an image or rectangle.
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(...)
Num samples, height, width, channels.
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage)
uint8x8_t vload(const uint8_t *ptr)
void vstore(uint8_t *ptr, uint8x8_t val)
const size_t weights_stride_z
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
uint8x8_t vdup_n(uint8_t value, traits::vector_64_tag)
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_function for each element.
const size_t input_stride_z
void set_num_dimensions(size_t num_dimensions)
Set number of dimensions.
quantized, asymmetric fixed-point 8-bit number signed
Includes all wrapper headers at once.
uint8x8_t vmla(const uint8x8_t &a, const uint8x8_t &b, const uint8x8_t &c)
Container for valid region of a window.
size_t get_data_layout_dimension_index(const DataLayout data_layout, const DataLayoutDimension data_layout_dimension)
Get the index of the given dimension.
void configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier=1, const Size2D &dilation=Size2D(1U, 1U))
Initialize the function's source, destination and parameters.
uint16x8_t vmovl(const uint8x8_t &a)
const size_t weights_width
uint8x8_t vmax(const uint8x8_t &a, const uint8x8_t &b)
const uint32_t x_leftover_start
const size_t conv_pad_left
Describe a multidimensional execution window.
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)
const size_t input_max_offset
static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier=1, const Size2D &dilation=Size2D(1U, 1U))
Static function to check if given info will lead to a valid configuration of NEDepthwiseConvolutionLayerNativeKernel.