#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
/** Load eight fp16 values starting at @p in, de-interleaved according to the
 *  convolution stride. Primary template is declared only; the per-stride
 *  specialisations that follow provide the bodies. */
template <unsigned int stridex>
float16x8_t internal_vld1q(const float16_t *in);
// NOTE(review): extraction-mangled fragment — the leading decimals ("56", "62", ...)
// are residual source line numbers, and the `template <>` headers, braces and
// return statements of these specialisations were lost in extraction.
// <1> presumably performs a plain contiguous vld1q_f16; <2>/<3> de-interleave
// with vld2q/vld3q and presumably return lane group 0 — TODO confirm against
// the unmangled original.
56 float16x8_t internal_vld1q<1>(
const float16_t *in)
62 float16x8_t internal_vld1q<2>(
const float16_t *in)
64 const float16x8x2_t tmp = vld2q_f16(in);
69 float16x8_t internal_vld1q<3>(
const float16_t *in)
71 const float16x8x3_t tmp = vld3q_f16(in);
/** Broadcast the scalar @p v into every one of the eight fp16 lanes. */
inline float16x8_t internal_vdupq_n(float16_t v)
{
    return vdupq_n_f16(v);
}
// NOTE(review): mangled fragment — bodies of all three fp16 helpers were lost
// in extraction; only the signatures remain. internal_vst1q presumably stores
// v to p via vst1q_f16; internal_vmull presumably returns the lane-wise product
// of x and y; internal_vmlal presumably returns x + y * z (mirroring the fp32
// overloads later in this file) — TODO confirm against the unmangled original.
80 inline void internal_vst1q(float16_t *p,
const float16x8_t &v)
85 float16x8_t internal_vmull(
const float16x8_t &x,
const float16x8_t &y)
90 inline float16x8_t internal_vmlal(
const float16x8_t &x,
const float16x8_t &y,
const float16x8_t &z)
/** Load four fp32 values starting at @p in, de-interleaved according to the
 *  convolution stride. Primary template is declared only; the stride 1/2/3
 *  specialisations below supply the bodies. */
template <unsigned int stridex>
float32x4_t internal_vld1q(const float *in);
/** Stride-1 specialisation: a plain contiguous 4-float load. */
template <>
float32x4_t internal_vld1q<1>(const float *in)
{
    return vld1q_f32(in);
}
// NOTE(review): mangled fragment — the `template <>` headers, return statements
// and braces were lost in extraction. Each specialisation de-interleaves with
// vld2q/vld3q so that lane group 0 holds every 2nd/3rd input element, and
// presumably returns tmp.val[0] — TODO confirm against the unmangled original.
106 float32x4_t internal_vld1q<2>(
const float *in)
108 const float32x4x2_t tmp = vld2q_f32(in);
113 float32x4_t internal_vld1q<3>(
const float *in)
115 const float32x4x3_t tmp = vld3q_f32(in);
/** Broadcast the scalar @p v into every one of the four fp32 lanes. */
inline float32x4_t internal_vdupq_n(float v)
{
    return vdupq_n_f32(v);
}
// NOTE(review): mangled fragment — the body was lost in extraction; it
// presumably stores the four lanes of v to p via vst1q_f32 — TODO confirm
// against the unmangled original.
124 inline void internal_vst1q(
float *p,
const float32x4_t &v)
/** Lane-wise product of two fp32 vectors. */
float32x4_t internal_vmull(const float32x4_t &x, const float32x4_t &y)
{
    return vmulq_f32(x, y);
}
/** Multiply-accumulate: returns x + y * z computed lane-wise. */
inline float32x4_t internal_vmlal(const float32x4_t &x, const float32x4_t &y, const float32x4_t &z)
{
    return vmlaq_f32(x, y, z);
}
/** Upper bound (inclusive) on the X and Y extents of a tensor for it to take
 *  the optimised small-tensor 1x1 convolution path. */
constexpr int small_tensor_size_optim = 8;
140 inline bool run_optim_small_tensor_info(
const ITensorInfo *
t)
142 return t->dimension(
Window::DimX) <= small_tensor_size_optim && t->dimension(
Window::DimY) <= small_tensor_size_optim;
145 inline bool run_optim_small_tensor(
const ITensor *t)
147 return run_optim_small_tensor_info(t->info());
// NOTE(review): heavily extraction-mangled fragment of the small-tensor
// 1x1 fp32 convolver. Leading decimals are residual source line numbers;
// the template parameter "stridex" is split across two lines; and several
// declarations referenced below (window_k, conv_pad_left/top, input_stride_y/z,
// ih/oh, offset_xy, the execute_window_loop lambda introducing `id`, plus
// braces) were lost in extraction — TODO restore from the unmangled original.
// Visible logic: for each output plane oz, zero 8+8 accumulators, accumulate
// vk0 * input across kernel_depth planes with vmlaq_f32, then store two
// float32x4 vectors per output row.
153 template <
unsigned int str
idex>
154 class convolver_w1x1_i8x8_f32
157 static void convolve(
const Window &window,
const ITensor *
input,
const ITensor *weights, ITensor *output,
const PadStrideInfo &
conv_info)
// Byte strides and extents taken from the tensors' metadata.
162 const int input_stride_x = input->info()->strides_in_bytes().x();
165 const int output_stride_y = output->info()->strides_in_bytes().y();
166 const int output_stride_z = output->info()->strides_in_bytes().z();
167 const int kernel_stride_z = weights->info()->strides_in_bytes().z();
168 const int kernel_stride_w = weights->info()->strides_in_bytes()[3];
169 const int output_h = output->info()->dimension(1);
170 const int range_z = window.z().end() - window.z().start();
171 const int kernel_depth = weights->info()->dimension(
Window::DimZ);
172 const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
// Output window processes the whole Z range in one step.
177 Window window_out = window;
180 window_out.set(
Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), range_z));
// Input window is collapsed so the iterator is advanced manually.
183 Window window_in = window;
185 window_in.set(
Window::DimX, Window::Dimension(0, 0, 0));
186 window_in.set(
Window::DimY, Window::Dimension(0, 0, 0));
187 window_in.set(
Window::DimZ, Window::Dimension(0, 0, 0));
190 Iterator out(output, window_out);
191 Iterator in(input, window_in);
192 Iterator k(weights, window_k);
194 const uint8_t *k_ptr = k.ptr();
// Rewind the input pointer to account for left/top padding.
198 const uint8_t *input_ptr = in.ptr() - conv_pad_left * input_stride_x - conv_pad_top *
input_stride_y;
199 uint8_t *out_ptr = out.ptr();
// Two banks of 8 accumulators: one output row each, 4+4 floats wide.
202 std::array<float32x4_t, 8> accum0 = { vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) };
203 std::array<float32x4_t, 8> accum1 = { vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) };
204 for(
int oz = 0; oz < range_z; ++oz)
206 accum0[0] = accum0[1] = accum0[2] = accum0[3] = accum0[4] = accum0[5] = accum0[6] = accum0[7] = vdupq_n_f32(0.f);
207 accum1[0] = accum1[1] = accum1[2] = accum1[3] = accum1[4] = accum1[5] = accum1[6] = accum1[7] = vdupq_n_f32(0.f);
208 auto p_out_base = out_ptr + oz * output_stride_z;
// Accumulate the contribution of every kernel plane p.
209 for(
int p = 0; p < kernel_depth; ++p)
211 const auto k_val =
reinterpret_cast<const float *
>(k_ptr + p * kernel_stride_z + (
id.z() + oz) * kernel_stride_w);
212 const auto vk0 = internal_vdupq_n(*k_val);
213 for(ih = 0, oh = 0; oh < output_h; ++oh, ih +=
conv_stride_y)
216 auto in_val =
reinterpret_cast<const float *
>(input_ptr + p * input_stride_z + offset_xy);
217 auto v_in0 = internal_vld1q<stridex>(in_val);
218 auto v_in1 = internal_vld1q<stridex>(in_val + 4);
219 accum0[oh] = vmlaq_f32(accum0[oh], vk0, v_in0);
220 accum1[oh] = vmlaq_f32(accum1[oh], vk0, v_in1);
// Flush the accumulators: 8 floats per output row.
223 for(oh = 0; oh < output_h; ++oh)
225 auto p_out =
reinterpret_cast<float *
>(p_out_base + oh * output_stride_y);
226 vst1q_f32(p_out, accum0[oh]);
227 vst1q_f32(p_out + 4, accum1[oh]);
// NOTE(review): mangled fragment of the generic 1x1 convolver (the enclosing
// class header line was lost in extraction, as were window_k, offset_xy,
// the ih/oh declarations, the execute_window_loop lambda providing `id`, and
// braces) — TODO restore from the unmangled original. Visible logic: kernel
// plane 0 initialises the output with internal_vmull, remaining planes
// accumulate into it by re-loading p_out and applying internal_vmlal.
235 template <
typename T1,
typename T2,
unsigned int str
idex>
239 static void convolve(
const Window &window,
unsigned int num_elems_read_per_iteration,
unsigned int num_elems_written_per_iteration,
240 const ITensor *input,
const ITensor *weights, ITensor *output,
const PadStrideInfo &conv_info)
// Byte strides and extents taken from the tensors' metadata.
242 const int input_stride_x = input->info()->strides_in_bytes().x();
243 const int input_stride_y = input->info()->strides_in_bytes().y();
244 const int input_stride_z = input->info()->strides_in_bytes().z();
245 const int output_stride_y = output->info()->strides_in_bytes().y();
246 const int output_stride_z = output->info()->strides_in_bytes().z();
247 const int kernel_stride_z = weights->info()->strides_in_bytes().z();
248 const int kernel_stride_w = weights->info()->strides_in_bytes()[3];
249 const int output_w = output->info()->dimension(0);
250 const int output_h = output->info()->dimension(1);
251 const int range_z = window.z().end() - window.z().start();
252 const int kernel_depth = weights->info()->dimension(
Window::DimZ);
253 const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
254 const unsigned int conv_pad_left = conv_info.pad_left();
255 const unsigned int conv_pad_top = conv_info.pad_top();
// Output window processes the whole Z range in one step.
258 Window window_out = window;
261 window_out.set(
Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), range_z));
// Input window is collapsed so the iterator is advanced manually.
264 Window window_in = window;
266 window_in.set(
Window::DimX, Window::Dimension(0, 0, 0));
267 window_in.set(
Window::DimY, Window::Dimension(0, 0, 0));
268 window_in.set(
Window::DimZ, Window::Dimension(0, 0, 0));
271 Iterator out(output, window_out);
272 Iterator in(input, window_in);
273 Iterator k(weights, window_k);
275 const uint8_t *k_ptr = k.ptr();
// Rewind the input pointer to account for left/top padding.
282 const uint8_t *input_ptr = in.ptr() - conv_pad_left * input_stride_x - conv_pad_top *
input_stride_y;
283 uint8_t *out_ptr = out.ptr();
286 for(
int oz = 0; oz < range_z; ++oz)
288 auto p_out_base = out_ptr + oz * output_stride_z;
// Kernel plane 0: overwrite the output (vmull, no accumulate).
291 const auto k_val =
reinterpret_cast<const T1 *
>(k_ptr + 0 * kernel_stride_z + (
id.z() + oz) * kernel_stride_w);
292 const auto vk = internal_vdupq_n(*k_val);
293 for(ih = 0, oh = 0; oh < output_h; ++oh, ih +=
conv_stride_y)
296 auto in_val =
reinterpret_cast<const T1 *
>(input_ptr + (0 * input_stride_z + offset_xy));
297 auto p_out =
reinterpret_cast<T2 *
>(p_out_base + oh * output_stride_y);
298 for(
int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, in_val += num_elems_read_per_iteration, p_out += num_elems_written_per_iteration)
300 internal_vst1q(p_out, internal_vmull(vk, internal_vld1q<stridex>(in_val)));
// Remaining kernel planes: accumulate into the previously stored output.
306 for(
int p = 1; p < kernel_depth; ++p)
308 const auto k_val =
reinterpret_cast<const T1 *
>(k_ptr + p * kernel_stride_z + (
id.z() + oz) * kernel_stride_w);
309 const auto vk = internal_vdupq_n(*k_val);
310 for(ih = 0, oh = 0; oh < output_h; ++oh, ih +=
conv_stride_y)
313 auto in_val =
reinterpret_cast<const T1 *
>(input_ptr + p * input_stride_z + offset_xy);
314 auto p_out =
reinterpret_cast<T2 *
>(p_out_base + oh * output_stride_y);
315 for(
int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, in_val += num_elems_read_per_iteration, p_out += num_elems_written_per_iteration)
317 internal_vst1q(p_out, internal_vmlal(internal_vld1q<1>(p_out), vk, internal_vld1q<stridex>(in_val)));
/** Compute two adjacent fp32 result vectors of one 5x5 convolution row.
 *
 * @p in_0..in_4 point at five consecutive input rows; @p m0..m4 point at the
 * matching kernel rows. Primary template is declared only; the per-stride
 * specialisations that follow provide the bodies.
 */
template <unsigned int stridex>
float32x4x2_t convolve_5x5(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4,
                           const float *m0, const float *m1, const float *m2, const float *m3, const float *m4);
// NOTE(review): mangled fragment — the aggregate-initialiser bodies and return
// statements of all three helpers were lost in extraction. They presumably
// broadcast (load_matrix_hi/lo, via vld1q_dup_f32 or similar) the five kernel
// coefficients of one row into 3+2 vectors, and load (load_input) three
// consecutive float32x4 vectors of one input row — TODO confirm against the
// unmangled original.
331 inline float32x4x3_t load_matrix_hi(
const float *
const m0,
const float *
const m1,
const float *
const m2)
333 const float32x4x3_t m00 =
344 inline float32x4x2_t load_matrix_lo(
const float *
const m3,
const float *
const m4)
346 const float32x4x2_t m00 =
356 inline float32x4x3_t load_input(
const float *
const in)
358 const float32x4x3_t vin =
// NOTE(review): mangled fragment of the three convolve_5x5 stride
// specialisations. The `template <>` headers, the declaration of `out`
// (whose initialiser survives partially at "392"/"393"), the return statements,
// one lane-compaction line of the <3> case and braces were lost in extraction —
// TODO restore from the unmangled original. Visible logic: <1> accumulates all
// 25 taps into out.val[0]/out.val[1] using vmlaq_f32 with vextq_f32 providing
// the shifted input windows; <2> and <3> reuse <1> and compact the lanes down
// to the strided output positions with vgetq/vsetq_lane.
370 inline float32x4x2_t convolve_5x5<1>(
const float *in_0,
const float *in_1,
const float *in_2,
const float *in_3,
const float *in_4,
371 const float *m0,
const float *m1,
const float *m2,
const float *m3,
const float *m4)
// Three vectors per input row give enough lanes for 8 outputs + 4-tap overhang.
373 const float32x4x3_t vin0 = load_input(in_0);
374 const float32x4x3_t vin1 = load_input(in_1);
375 const float32x4x3_t vin2 = load_input(in_2);
376 const float32x4x3_t vin3 = load_input(in_3);
377 const float32x4x3_t vin4 = load_input(in_4);
// Kernel row r is split into a 3-vector "hi" part and a 2-vector "lo" part.
378 const float32x4x3_t m00 = load_matrix_hi(m0, 1 + m0, 2 + m0);
379 const float32x4x2_t m01 = load_matrix_lo(3 + m0, 4 + m0);
380 const float32x4x3_t m10 = load_matrix_hi(m1, 1 + m1, 2 + m1);
381 const float32x4x2_t m11 = load_matrix_lo(3 + m1, 4 + m1);
382 const float32x4x3_t m20 = load_matrix_hi(m2, 1 + m2, 2 + m2);
383 const float32x4x2_t m21 = load_matrix_lo(3 + m2, 4 + m2);
384 const float32x4x3_t m30 = load_matrix_hi(m3, 1 + m3, 2 + m3);
385 const float32x4x2_t m31 = load_matrix_lo(3 + m3, 4 + m3);
386 const float32x4x3_t m40 = load_matrix_hi(m4, 1 + m4, 2 + m4);
387 const float32x4x2_t m41 = load_matrix_lo(3 + m4, 4 + m4);
// Surviving part of the `out` initialiser (tap 0 of row 0 for both halves).
392 vmulq_f32(vin0.val[0], m00.val[0]),
393 vmulq_f32(vin0.val[1], m00.val[0])
// Row 0, taps 1..4 into the first output half.
397 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin0.val[0], vin0.val[1], 1), m00.val[1]);
398 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin0.val[0], vin0.val[1], 2), m00.val[2]);
399 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin0.val[0], vin0.val[1], 3), m01.val[0]);
400 out.val[0] = vmlaq_f32(out.val[0], vin0.val[1], m01.val[1]);
402 out.val[0] = vmlaq_f32(out.val[0], vin1.val[0], m10.val[0]);
403 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin1.val[0], vin1.val[1], 1), m10.val[1]);
404 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin1.val[0], vin1.val[1], 2), m10.val[2]);
405 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin1.val[0], vin1.val[1], 3), m11.val[0]);
406 out.val[0] = vmlaq_f32(out.val[0], vin1.val[1], m11.val[1]);
408 out.val[0] = vmlaq_f32(out.val[0], vin2.val[0], m20.val[0]);
409 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin2.val[0], vin2.val[1], 1), m20.val[1]);
410 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin2.val[0], vin2.val[1], 2), m20.val[2]);
411 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin2.val[0], vin2.val[1], 3), m21.val[0]);
412 out.val[0] = vmlaq_f32(out.val[0], vin2.val[1], m21.val[1]);
414 out.val[0] = vmlaq_f32(out.val[0], vin3.val[0], m30.val[0]);
415 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin3.val[0], vin3.val[1], 1), m30.val[1]);
416 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin3.val[0], vin3.val[1], 2), m30.val[2]);
417 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin3.val[0], vin3.val[1], 3), m31.val[0]);
418 out.val[0] = vmlaq_f32(out.val[0], vin3.val[1], m31.val[1]);
420 out.val[0] = vmlaq_f32(out.val[0], vin4.val[0], m40.val[0]);
421 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin4.val[0], vin4.val[1], 1), m40.val[1]);
422 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin4.val[0], vin4.val[1], 2), m40.val[2]);
423 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin4.val[0], vin4.val[1], 3), m41.val[0]);
424 out.val[0] = vmlaq_f32(out.val[0], vin4.val[1], m41.val[1]);
// Same 25-tap pattern shifted one vector for the second output half.
426 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin0.val[1], vin0.val[2], 1), m00.val[1]);
427 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin0.val[1], vin0.val[2], 2), m00.val[2]);
428 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin0.val[1], vin0.val[2], 3), m01.val[0]);
429 out.val[1] = vmlaq_f32(out.val[1], vin0.val[2], m01.val[1]);
431 out.val[1] = vmlaq_f32(out.val[1], vin1.val[1], m10.val[0]);
432 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin1.val[1], vin1.val[2], 1), m10.val[1]);
433 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin1.val[1], vin1.val[2], 2), m10.val[2]);
434 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin1.val[1], vin1.val[2], 3), m11.val[0]);
435 out.val[1] = vmlaq_f32(out.val[1], vin1.val[2], m11.val[1]);
437 out.val[1] = vmlaq_f32(out.val[1], vin2.val[1], m20.val[0]);
438 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin2.val[1], vin2.val[2], 1), m20.val[1]);
439 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin2.val[1], vin2.val[2], 2), m20.val[2]);
440 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin2.val[1], vin2.val[2], 3), m21.val[0]);
441 out.val[1] = vmlaq_f32(out.val[1], vin2.val[2], m21.val[1]);
443 out.val[1] = vmlaq_f32(out.val[1], vin3.val[1], m30.val[0]);
444 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin3.val[1], vin3.val[2], 1), m30.val[1]);
445 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin3.val[1], vin3.val[2], 2), m30.val[2]);
446 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin3.val[1], vin3.val[2], 3), m31.val[0]);
447 out.val[1] = vmlaq_f32(out.val[1], vin3.val[2], m31.val[1]);
449 out.val[1] = vmlaq_f32(out.val[1], vin4.val[1], m40.val[0]);
450 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin4.val[1], vin4.val[2], 1), m40.val[1]);
451 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin4.val[1], vin4.val[2], 2), m40.val[2]);
452 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin4.val[1], vin4.val[2], 3), m41.val[0]);
453 out.val[1] = vmlaq_f32(out.val[1], vin4.val[2], m41.val[1]);
// Stride-2: compute at stride 1, then gather every 2nd lane into out.val[0].
459 inline float32x4x2_t convolve_5x5<2>(
const float *in_0,
const float *in_1,
const float *in_2,
const float *in_3,
const float *in_4,
460 const float *m0,
const float *m1,
const float *m2,
const float *m3,
const float *m4)
462 float32x4x2_t out = convolve_5x5<1>(in_0, in_1, in_2, in_3, in_4, m0, m1, m2, m3, m4);
463 out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1);
464 out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2);
465 out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3);
// Stride-3: compute at stride 1, then gather every 3rd lane (partially lost).
470 inline float32x4x2_t convolve_5x5<3>(
const float *in_0,
const float *in_1,
const float *in_2,
const float *in_3,
const float *in_4,
471 const float *m0,
const float *m1,
const float *m2,
const float *m3,
const float *m4)
473 float32x4x2_t out = convolve_5x5<1>(in_0, in_1, in_2, in_3, in_4, m0, m1, m2, m3, m4);
474 out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
// NOTE(review): mangled fragment of the 3x3 convolver (the enclosing class
// header, the vk_r0/vk_r1/vk_r2 kernel-row loads, delta_input, input_base,
// ih/oh declarations, the execute_window_loop lambda providing `id`, and
// braces were lost in extraction) — TODO restore from the unmangled original.
// Visible logic: kernel plane 0 writes the output (convolve_3x3<false>),
// planes 1..kernel_depth-1 accumulate into it (convolve_3x3<true>), walking
// three input rows (top/mid/low) per output row.
478 template <
typename T1,
typename T2,
unsigned int str
idex>
482 static void convolve(
const Window &window,
unsigned int num_elems_read_per_iteration,
unsigned int num_elems_written_per_iteration,
483 const ITensor *input,
const ITensor *weights, ITensor *output,
const PadStrideInfo &conv_info)
// Byte strides and extents taken from the tensors' metadata.
486 const int input_stride_x = input->info()->strides_in_bytes().x();
487 const int input_stride_y = input->info()->strides_in_bytes().y();
488 const int input_stride_z = input->info()->strides_in_bytes().z();
489 const int output_stride_y = output->info()->strides_in_bytes().y();
490 const int output_stride_z = output->info()->strides_in_bytes().z();
491 const int kernel_stride_x = weights->info()->strides_in_bytes().x();
492 const int kernel_stride_y = weights->info()->strides_in_bytes().y();
493 const int kernel_stride_z = weights->info()->strides_in_bytes().z();
494 const int kernel_stride_w = weights->info()->strides_in_bytes()[3];
495 const int output_w = output->info()->dimension(0);
496 const int output_h = output->info()->dimension(1);
497 const int num_planes_z = window.z().end() - window.z().start();
499 const int kernel_depth = weights->info()->dimension(
Window::DimZ);
500 const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
501 const unsigned int conv_pad_left = conv_info.pad_left();
502 const unsigned int conv_pad_top = conv_info.pad_top();
// Output window processes the whole Z range in one step.
505 Window window_out = window;
508 window_out.set(
Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), num_planes_z));
// Input window is collapsed so the iterator is advanced manually.
511 Window window_in = window;
513 window_in.set(
Window::DimX, Window::Dimension(0, 0, 0));
514 window_in.set(
Window::DimY, Window::Dimension(0, 0, 0));
515 window_in.set(
Window::DimZ, Window::Dimension(0, 0, 0));
519 Iterator out(output, window_out);
520 Iterator in(input, window_in);
521 Iterator k(weights, window_k);
523 const uint8_t *k_ptr = k.ptr();
// Rewind the input pointer to account for left/top padding.
527 const uint8_t *input_ptr = in.ptr() - conv_pad_left * input_stride_x - conv_pad_top *
input_stride_y;
528 uint8_t *out_ptr = out.ptr();
544 for(
int oz = 0; oz < num_planes_z; ++oz)
546 const int zoffset =
id.z() + oz;
547 uint8_t *p_out_base = out_ptr + oz * output_stride_z;
// Kernel plane 0: pointers to its three rows (loads into vk_r* lost above).
550 const auto ptr_k_r0 =
reinterpret_cast<const T1 *
>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x);
551 const auto ptr_k_r1 =
reinterpret_cast<const T1 *
>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x);
552 const auto ptr_k_r2 =
reinterpret_cast<const T1 *
>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x);
556 for(ih = 0, oh = 0; oh < output_h; ++oh, ih +=
conv_stride_y)
558 auto in_top =
reinterpret_cast<const T1 *
>(input_ptr + 0 * input_stride_z + (ih + 0) * input_stride_y);
559 auto in_mid =
reinterpret_cast<const T1 *
>(input_ptr + 0 * input_stride_z + (ih + 1) * input_stride_y);
560 auto in_low =
reinterpret_cast<const T1 *
>(input_ptr + 0 * input_stride_z + (ih + 2) * input_stride_y);
561 auto p_out =
reinterpret_cast<T2 *
>(p_out_base + oh * output_stride_y);
562 for(
int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
563 in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration)
// <false>: overwrite output (first plane, no accumulation).
565 convolve_3x3<false>(in_top, in_mid, in_low, p_out, vk_r0, vk_r1, vk_r2, stridex);
// Remaining kernel planes: accumulate into the stored output.
570 for(
int p = 1; p < kernel_depth; ++p)
572 const uint8_t *ptr_k_base = k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w;
574 const auto ptr_k_r0 =
reinterpret_cast<const T1 *
>(ptr_k_base);
575 const auto ptr_k_r1 =
reinterpret_cast<const T1 *
>(ptr_k_base + kernel_stride_y);
576 const auto ptr_k_r2 =
reinterpret_cast<const T1 *
>(ptr_k_base + kernel_stride_y * 2);
580 for(ih = 0, oh = 0; oh < output_h; ++oh, ih +=
conv_stride_y)
582 auto in_top =
reinterpret_cast<const T1 *
>(input_base + (ih + 0) * input_stride_y);
583 auto in_mid =
reinterpret_cast<const T1 *
>(input_base + (ih + 1) * input_stride_y);
584 auto in_low =
reinterpret_cast<const T1 *
>(input_base + (ih + 2) * input_stride_y);
585 auto p_out =
reinterpret_cast<T2 *
>(p_out_base + oh * output_stride_y);
586 for(
int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
587 in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration)
// <true>: accumulate onto the existing output values.
589 convolve_3x3<true>(in_top, in_mid, in_low, p_out, vk_r0, vk_r1, vk_r2, stridex);
// NOTE(review): mangled fragment of the 5x5 convolver (the enclosing class
// header, window_k, delta_input, ih/oh declarations, the execute_window_loop
// lambda providing `id`, and braces were lost in extraction) — TODO restore
// from the unmangled original. Visible logic: kernel plane 0 stores results
// (store_results), planes 1..kernel_depth-1 accumulate (accumulate_results),
// walking five input rows per output row through convolve_5x5<stridex>.
599 template <
typename T1,
typename T2,
unsigned int str
idex>
603 static void convolve(
const Window &window,
unsigned int num_elems_read_per_iteration,
unsigned int num_elems_written_per_iteration,
604 const ITensor *input,
const ITensor *weights, ITensor *output,
const PadStrideInfo &conv_info)
// Byte strides and extents taken from the tensors' metadata.
607 const int input_stride_x = input->info()->strides_in_bytes().x();
608 const int input_stride_y = input->info()->strides_in_bytes().y();
609 const int input_stride_z = input->info()->strides_in_bytes().z();
610 const int output_stride_y = output->info()->strides_in_bytes().y();
611 const int output_stride_z = output->info()->strides_in_bytes().z();
612 const int kernel_stride_x = weights->info()->strides_in_bytes().x();
613 const int kernel_stride_y = weights->info()->strides_in_bytes().y();
614 const int kernel_stride_z = weights->info()->strides_in_bytes().z();
615 const int kernel_stride_w = weights->info()->strides_in_bytes()[3];
616 const int output_w = output->info()->dimension(0);
617 const int output_h = output->info()->dimension(1);
618 const int num_planes_z = window.z().end() - window.z().start();
620 const int kernel_depth = weights->info()->dimension(
Window::DimZ);
621 const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
622 const unsigned int conv_pad_left = conv_info.pad_left();
623 const unsigned int conv_pad_top = conv_info.pad_top();
// Output window processes the whole Z range in one step.
626 Window window_out = window;
629 window_out.set(
Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), num_planes_z));
// Input window is collapsed so the iterator is advanced manually.
632 Window window_in = window;
634 window_in.set(
Window::DimX, Window::Dimension(0, 0, 0));
635 window_in.set(
Window::DimY, Window::Dimension(0, 0, 0));
636 window_in.set(
Window::DimZ, Window::Dimension(0, 0, 0));
640 Iterator out(output, window_out);
641 Iterator in(input, window_in);
642 Iterator k(weights, window_k);
644 const uint8_t *k_ptr = k.ptr();
// Rewind the input pointer to account for left/top padding.
648 const uint8_t *input_ptr = in.ptr() - conv_pad_left * input_stride_x - conv_pad_top *
input_stride_y;
649 uint8_t *out_ptr = out.ptr();
652 for(
int oz = 0; oz < num_planes_z; ++oz)
654 const int zoffset =
id.z() + oz;
655 uint8_t *p_out_base = out_ptr + oz * output_stride_z;
// Kernel plane 0: pointers to its five rows.
658 const auto ptr_k_r0 =
reinterpret_cast<const T1 *
>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x);
659 const auto ptr_k_r1 =
reinterpret_cast<const T1 *
>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x);
660 const auto ptr_k_r2 =
reinterpret_cast<const T1 *
>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x);
661 const auto ptr_k_r3 =
reinterpret_cast<const T1 *
>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 3 * kernel_stride_y + 0 * kernel_stride_x);
662 const auto ptr_k_r4 =
reinterpret_cast<const T1 *
>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 4 * kernel_stride_y + 0 * kernel_stride_x);
663 for(ih = 0, oh = 0; oh < output_h; ++oh, ih +=
conv_stride_y)
665 auto in_0 =
reinterpret_cast<const T1 *
>(input_ptr + 0 * input_stride_z + (ih + 0) * input_stride_y);
666 auto in_1 =
reinterpret_cast<const T1 *
>(input_ptr + 0 * input_stride_z + (ih + 1) * input_stride_y);
667 auto in_2 =
reinterpret_cast<const T1 *
>(input_ptr + 0 * input_stride_z + (ih + 2) * input_stride_y);
668 auto in_3 =
reinterpret_cast<const T1 *
>(input_ptr + 0 * input_stride_z + (ih + 3) * input_stride_y);
669 auto in_4 =
reinterpret_cast<const T1 *
>(input_ptr + 0 * input_stride_z + (ih + 4) * input_stride_y);
670 auto p_out =
reinterpret_cast<T2 *
>(p_out_base + oh * output_stride_y);
671 for(
int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
672 in_0 += delta_input, in_1 += delta_input, in_2 += delta_input, in_3 += delta_input, in_4 += delta_input, p_out += num_elems_written_per_iteration)
// First plane overwrites the output.
674 auto vres = convolve_5x5<stridex>(in_0, in_1, in_2, in_3, in_4, ptr_k_r0, ptr_k_r1, ptr_k_r2, ptr_k_r3, ptr_k_r4);
675 store_results<stridex>(p_out, vres);
// Remaining kernel planes accumulate onto the stored output.
680 for(
int p = 1; p < kernel_depth; ++p)
682 const auto ptr_k_r0 =
reinterpret_cast<const T1 *
>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x);
683 const auto ptr_k_r1 =
reinterpret_cast<const T1 *
>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x);
684 const auto ptr_k_r2 =
reinterpret_cast<const T1 *
>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x);
685 const auto ptr_k_r3 =
reinterpret_cast<const T1 *
>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 3 * kernel_stride_y + 0 * kernel_stride_x);
686 const auto ptr_k_r4 =
reinterpret_cast<const T1 *
>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 4 * kernel_stride_y + 0 * kernel_stride_x);
688 for(ih = 0, oh = 0; oh < output_h; ++oh, ih +=
conv_stride_y)
690 auto in_0 =
reinterpret_cast<const T1 *
>(input_ptr + p * input_stride_z + (ih + 0) * input_stride_y);
691 auto in_1 =
reinterpret_cast<const T1 *
>(input_ptr + p * input_stride_z + (ih + 1) * input_stride_y);
692 auto in_2 =
reinterpret_cast<const T1 *
>(input_ptr + p * input_stride_z + (ih + 2) * input_stride_y);
693 auto in_3 =
reinterpret_cast<const T1 *
>(input_ptr + p * input_stride_z + (ih + 3) * input_stride_y);
694 auto in_4 =
reinterpret_cast<const T1 *
>(input_ptr + p * input_stride_z + (ih + 4) * input_stride_y);
695 auto p_out =
reinterpret_cast<T2 *
>(p_out_base + oh * output_stride_y);
696 for(
int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
697 in_0 += delta_input, in_1 += delta_input, in_2 += delta_input, in_3 += delta_input, in_4 += delta_input, p_out += num_elems_written_per_iteration)
699 auto vres = convolve_5x5<stridex>(in_0, in_1, in_2, in_3, in_4, ptr_k_r0, ptr_k_r1, ptr_k_r2, ptr_k_r3, ptr_k_r4);
700 accumulate_results<stridex>(p_out, vres);
// NOTE(review): mangled fragment — vreduce's body (presumably a horizontal sum
// of the four lanes — TODO confirm), the switch case labels, default branches
// and braces of both convolve_1x1 overloads were lost in extraction.
// convolve_1x1 dispatches on the X stride (1/2/3) to the matching convolver
// instantiation; the <float, float> specialisation additionally routes small
// tensors to the optimised convolver_w1x1_i8x8_f32 path.
710 float vreduce(
const float32x4_t &v)
721 template <
typename T1,
typename T2>
722 inline void convolve_1x1(
const Window &window,
unsigned int num_elems_read_per_iteration,
unsigned int num_elems_written_per_iteration,
723 const ITensor *input,
const ITensor *weights, ITensor *output,
const PadStrideInfo &conv_info)
725 const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
726 switch(conv_stride_x)
729 convolver_1x1<T1, T2, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
732 convolver_1x1<T1, T2, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
735 convolver_1x1<T1, T2, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
// Specialisation for fp32: small tensors take the fast dedicated kernel.
743 inline void convolve_1x1<float, float>(
const Window &window,
unsigned int num_elems_read_per_iteration,
unsigned int num_elems_written_per_iteration,
744 const ITensor *
input,
const ITensor *weights, ITensor *output,
const PadStrideInfo &
conv_info)
746 const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
747 if(run_optim_small_tensor(input))
749 switch(conv_stride_x)
752 convolver_w1x1_i8x8_f32<1>::convolve(window, input, weights, output, conv_info);
755 convolver_w1x1_i8x8_f32<2>::convolve(window, input, weights, output, conv_info);
758 convolver_w1x1_i8x8_f32<3>::convolve(window, input, weights, output, conv_info);
// Fallback: generic 1x1 convolver.
766 switch(conv_stride_x)
769 convolver_1x1<float, float, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
772 convolver_1x1<float, float, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
775 convolver_1x1<float, float, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
// NOTE(review): mangled fragment — case labels, break/default branches and
// braces were lost in extraction. Both helpers dispatch on the X stride
// (presumably cases 1/2/3 with an error default — TODO confirm) to the
// matching convolver_3x3 / convolver_5x5 instantiation.
783 template <
typename T1,
typename T2>
784 inline void convolve_3x3(
const Window &window,
unsigned int num_elems_read_per_iteration,
unsigned int num_elems_written_per_iteration,
785 const ITensor *input,
const ITensor *weights, ITensor *output,
const PadStrideInfo &conv_info)
787 const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
788 switch(conv_stride_x)
791 convolver_3x3<T1, T2, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
794 convolver_3x3<T1, T2, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
797 convolver_3x3<T1, T2, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
804 template <
typename T1,
typename T2>
805 inline void convolve_5x5(
const Window &window,
unsigned int num_elems_read_per_iteration,
unsigned int num_elems_written_per_iteration,
806 const ITensor *input,
const ITensor *weights, ITensor *output,
const PadStrideInfo &conv_info)
808 const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
809 switch(conv_stride_x)
812 convolver_5x5<T1, T2, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
815 convolver_5x5<T1, T2, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
818 convolver_5x5<T1, T2, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info);
// NOTE(review): heavily truncated fragment. validate_arguments' checks are
// almost entirely missing; validate_and_configure_window is missing the
// width_idx/height_idx derivation, the kernel-size switch bodies' surrounding
// case labels, upper_bound_w, the window calculation (`win`), `err`, and the
// Coordinates `coord` declaration — TODO restore from the unmangled original.
// Visible logic: per data-type selection of read/write element counts, border
// size computation from padding, and access-window configuration.
825 Status
validate_arguments(
const ITensorInfo *input,
const ITensorInfo *weights,
const ITensorInfo *output,
const PadStrideInfo &conv_info)
// Output shape checks presumably only apply once the output is configured.
846 if(output->total_size() != 0)
859 std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *weights, ITensorInfo *output,
const PadStrideInfo &conv_info,
unsigned int &num_weight_elems_read_per_row,
860 unsigned int &num_elems_read_per_iteration,
unsigned int &num_elems_written_per_iteration, BorderSize &border_size)
864 const DataLayout data_layout = input->data_layout();
868 unsigned int kernel_size = weights->dimension(width_idx);
869 const int conv_stride_x = std::get<0>(conv_info.stride());
870 const int conv_stride_y = std::get<1>(conv_info.stride());
871 const int input_width = input->dimension(width_idx);
874 bool window_changed =
false;
// Elements written per iteration depend on data type (and small-tensor path).
882 switch(input->data_type())
884 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 886 num_elems_written_per_iteration = 8;
890 if(run_optim_small_tensor_info(input))
892 num_elems_written_per_iteration = 8;
896 num_elems_written_per_iteration = 4;
903 num_weight_elems_read_per_row = kernel_size;
904 num_elems_read_per_iteration = conv_stride_x * num_elems_written_per_iteration;
908 switch(input->data_type())
911 num_weight_elems_read_per_row = 4 + kernel_size - 1;
912 num_elems_read_per_iteration = 12;
915 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 917 num_weight_elems_read_per_row = 8 + kernel_size - 1;
918 num_elems_read_per_iteration = 24;
929 switch(input->data_type())
932 num_weight_elems_read_per_row = 4 + kernel_size - 1;
933 num_elems_read_per_iteration = 12;
// Border handling: right/bottom padding derived from the access pattern.
950 int start_x = kernel_size / 2 -
static_cast<int>(conv_info.pad_left());
955 const unsigned int conv_pad_left = conv_info.pad_left();
956 const unsigned int conv_pad_top = conv_info.pad_top();
957 const unsigned int conv_pad_right = std::max(upper_bound_w, 0);
958 const unsigned int conv_pad_bottom = conv_info.pad_bottom();
962 border_size.right = conv_pad_right;
963 border_size.bottom = conv_pad_bottom;
// Declare the access windows used for padding/validity bookkeeping.
968 AccessWindowRectangle input_access(input, -conv_pad_left, -conv_pad_top,
969 num_elems_read_per_iteration, kernel_size,
970 conv_stride_x, conv_stride_y);
971 AccessWindowStatic weights_access(weights, 0, 0, num_weight_elems_read_per_row, kernel_size);
972 AccessWindowHorizontal output_access(output, 0, num_elems_written_per_iteration);
974 output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
981 coord.set_num_dimensions(output->num_dimensions());
982 output->set_valid_region(ValidRegion(coord, output->tensor_shape()));
986 return std::make_pair(err, win);
989 bool have_zero_x_internal_padding(ITensorInfo *input, ITensorInfo *weights)
991 return (input->padding().left == 0 && weights->padding().left == 0 && input->padding().right == 0 && weights->padding().right == 0);
996 template <
typename T>
997 void NEDirectConvolutionLayerKernel::convolve_nhwc_optimized(
const Window &window)
1002 using vtype = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
1004 using tag_type =
typename vtype::tag_type;
1007 const int element_size = _input->info()->element_size();
1008 const int input_stride_w = _input->info()->strides_in_bytes().y() / element_size;
1009 const int input_stride_h = _input->info()->strides_in_bytes().z() / element_size;
1010 const int input_stride_n = _input->info()->strides_in_bytes()[3] / element_size;
1011 const int input_dim_w = _input->info()->dimension(1);
1012 const int input_dim_h = _input->info()->dimension(2);
1014 const int output_stride_c = _output->info()->strides_in_bytes().x();
1016 const unsigned int kernel_stride_w = _weights->info()->strides_in_bytes().y() / element_size;
1017 const unsigned int kernel_stride_h = _weights->info()->strides_in_bytes().z() / element_size;
1018 const int kernel_dim_w = _weights->info()->dimension(1);
1019 const int kernel_dim_h = _weights->info()->dimension(2);
1021 const int conv_pad_top = _conv_info.pad_top();
1022 const int conv_pad_left = _conv_info.pad_left();
1023 const int conv_stride_w = std::get<0>(_conv_info.stride());
1024 const int conv_stride_h = std::get<1>(_conv_info.stride());
1027 Window window_out = window;
1028 window_out.set(
Window::DimX, Window::Dimension(0, 1, 1));
1033 window_w.set(
Window::DimY, Window::Dimension(0, 1, 1));
1034 window_w.set(
Window::DimZ, Window::Dimension(0, 1, 1));
1036 Iterator out(_output, window_out);
1037 Iterator wei(_weights, window_w);
1039 constexpr
int num_elems_read_per_iteration = 16 /
sizeof(T);
1061 const int in_w_start_t =
static_cast<int>(
id.y()) * conv_stride_w - conv_pad_left;
1062 const int in_h_start_t =
static_cast<int>(
id.z()) * conv_stride_h - conv_pad_top;
1063 const int in_w_end_t = in_w_start_t + kernel_dim_w;
1064 const int in_h_end_t = in_h_start_t + kernel_dim_h;
1067 const int in_w_start = std::max(in_w_start_t, 0);
1068 const int in_h_start = std::max(in_h_start_t, 0);
1069 const int in_w_end = std::min(in_w_end_t, input_dim_w);
1070 const int in_h_end = std::min(in_h_end_t, input_dim_h);
1073 const int index_wc_start = (in_w_start - in_w_start_t) * kernel_stride_w;
1074 const int index_h_start = in_h_start - in_h_start_t;
1075 const int index_wc_end = (kernel_dim_w - (in_w_end_t - in_w_end)) * kernel_stride_w;
1076 const int index_h_end = kernel_dim_h - (in_h_end_t - in_h_end);
1085 const T *in_ptr_row =
reinterpret_cast<const T *
>(_input->buffer() + _input->info()->offset_first_element_in_bytes())
1086 +
id[3] * input_stride_n + in_w_start * input_stride_w + in_h_start * input_stride_h;
1087 const T *weights_ptr_row =
reinterpret_cast<const T *
>(wei.ptr()) + index_h_start * kernel_stride_h;
1088 uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c;
1090 T out_temp =
static_cast<T
>(0);
1091 for(
int index_h = index_h_start; index_h < index_h_end; ++index_h, in_ptr_row += input_stride_h, weights_ptr_row += kernel_stride_h)
1093 const T *in_ptr_mover = in_ptr_row;
1094 int index_wc = index_wc_start;
1095 vector_type out_temp_vec =
wrapper::vdup_n(static_cast<T>(0), tag_type());
1096 for(; index_wc <= index_wc_end - num_elems_read_per_iteration; index_wc += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration)
1102 out_temp += vreduce(out_temp_vec);
1103 for(; index_wc < index_wc_end; ++index_wc, ++in_ptr_mover)
1105 const auto src_val = *(in_ptr_mover);
1106 const auto w_val = *(weights_ptr_row + index_wc);
1107 out_temp += src_val * w_val;
1110 *(
reinterpret_cast<T *
>(out_ptr)) = out_temp;
1117 template <
typename T>
1118 void NEDirectConvolutionLayerKernel::convolve_nhwc(
const Window &window)
1121 using vtype = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
1123 using tag_type =
typename vtype::tag_type;
1126 const int element_size = _input->info()->element_size();
1127 const int input_stride_w = _input->info()->strides_in_bytes().y() / element_size;
1128 const int input_stride_h = _input->info()->strides_in_bytes().z() / element_size;
1129 const int input_stride_n = _input->info()->strides_in_bytes()[3] / element_size;
1130 const int input_dim_w = _input->info()->dimension(1);
1131 const int input_dim_h = _input->info()->dimension(2);
1133 const int output_stride_c = _output->info()->strides_in_bytes().x();
1135 const unsigned int kernel_stride_w = _weights->info()->strides_in_bytes().y() / element_size;
1136 const unsigned int kernel_stride_h = _weights->info()->strides_in_bytes().z() / element_size;
1137 const int kernel_dim_w = _weights->info()->dimension(1);
1138 const int kernel_dim_h = _weights->info()->dimension(2);
1140 const int conv_pad_top = _conv_info.pad_top();
1141 const int conv_pad_left = _conv_info.pad_left();
1142 const int conv_stride_w = std::get<0>(_conv_info.stride());
1143 const int conv_stride_h = std::get<1>(_conv_info.stride());
1146 Window window_out = window;
1147 window_out.set(
Window::DimX, Window::Dimension(0, 1, 1));
1152 window_w.set(
Window::DimY, Window::Dimension(0, 1, 1));
1153 window_w.set(
Window::DimZ, Window::Dimension(0, 1, 1));
1155 Iterator out(_output, window_out);
1156 Iterator wei(_weights, window_w);
1158 constexpr
int num_elems_read_per_iteration = 16 /
sizeof(T);
1163 const int in_w_start_t =
static_cast<int>(
id.y()) * conv_stride_w - conv_pad_left;
1164 const int in_h_start_t =
static_cast<int>(
id.z()) * conv_stride_h - conv_pad_top;
1165 const int in_w_end_t = in_w_start_t + kernel_dim_w;
1166 const int in_h_end_t = in_h_start_t + kernel_dim_h;
1169 const int in_w_start = std::max(in_w_start_t, 0);
1170 const int in_h_start = std::max(in_h_start_t, 0);
1171 const int in_w_end = std::min(in_w_end_t, input_dim_w);
1172 const int in_h_end = std::min(in_h_end_t, input_dim_h);
1175 const int wei_w_start = in_w_start - in_w_start_t;
1176 const int wei_h_start = in_h_start - in_h_start_t;
1177 const int wei_w_end = kernel_dim_w - (in_w_end_t - in_w_end);
1178 const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end);
1180 const int index_c_end = _weights->info()->dimension(0);
1181 const T *
const in_ptr_start =
reinterpret_cast<const T *
>(_input->buffer() + _input->info()->offset_first_element_in_bytes()) +
id[3] * input_stride_n;
1185 const T *
const weights_ptr_start =
reinterpret_cast<const T *
>(wei.ptr());
1186 uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c;
1188 T out_temp =
static_cast<T
>(0);
1189 for(
int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; ++index_wei_h, ++index_in_h)
1191 const T *
const in_ptr_row = in_ptr_start + index_in_h * input_stride_h;
1192 const T *
const weights_ptr_row = weights_ptr_start + index_wei_h * kernel_stride_h;
1193 for(
int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end; ++index_wei_w, ++index_in_w)
1195 const T *in_ptr_mover = in_ptr_row + index_in_w * input_stride_w;
1196 const T *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w;
1198 vector_type out_temp_vec =
wrapper::vdup_n(static_cast<T>(0), tag_type());
1199 for(; index_c <= index_c_end - num_elems_read_per_iteration; index_c += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration, weights_ptr_mover += num_elems_read_per_iteration)
1205 out_temp += vreduce(out_temp_vec);
1206 for(; index_c < index_c_end; ++index_c, ++in_ptr_mover, ++weights_ptr_mover)
1208 const auto src_val = *(in_ptr_mover);
1209 const auto w_val = *(weights_ptr_mover);
1210 out_temp += src_val * w_val;
1214 *(
reinterpret_cast<T *
>(out_ptr)) = out_temp;
1222 : _input(nullptr), _weights(nullptr), _output(nullptr), _conv_info(), _border_size(0), _kernel_size(0), _num_weight_elems_read_per_row(0), _num_elems_read_per_iteration(0),
1223 _num_elems_written_per_iteration(0)
1229 return _border_size;
1242 const unsigned int conv_pad_left = conv_info.
pad_left();
1243 const unsigned int conv_pad_top = conv_info.
pad_top();
1244 const unsigned int conv_pad_right = conv_info.
pad_right();
1245 const unsigned int conv_pad_bottom = conv_info.
pad_bottom();
1248 _border_size =
BorderSize(conv_pad_top, conv_pad_right, conv_pad_bottom, conv_pad_left);
1267 auto win_config = validate_and_configure_window(input->
info(), weights->
info(), output->
info(),
conv_info, _num_weight_elems_read_per_row,
1268 _num_elems_read_per_iteration, _num_elems_written_per_iteration, _border_size);
1270 INEKernel::configure(win_config.second);
1275 unsigned int num_weight_elems_read_per_row = 0;
1276 unsigned int num_elems_read_per_iteration = 0;
1277 unsigned int num_elems_written_per_iteration = 0;
1281 weights->
clone().get(),
1282 output->
clone().get(),
1284 num_weight_elems_read_per_row,
1285 num_elems_read_per_iteration,
1286 num_elems_written_per_iteration,
1311 convolve_1x1<float, float>(
window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
1313 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 1315 convolve_1x1<float16_t, float16_t>(
window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
1329 convolve_3x3<float, float>(
window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
1331 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 1333 convolve_3x3<float16_t, float16_t>(
window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
1347 convolve_5x5<float, float>(
window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info);
1368 if(have_zero_x_internal_padding(_input->
info(), _weights->
info()))
1370 convolve_nhwc_optimized<float>(
window);
1374 convolve_nhwc<float>(
window);
const size_t conv_pad_top
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
const Window & window() const
The maximum window the kernel can be executed on.
float16x8_t vmulq_f16(float16x8_t, float16x8_t)
#define ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(tensor)
virtual size_t dimension(size_t index) const =0
Return the size of the requested dimension.
Container for 2D border size.
void configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info)
Set the input, weights, and output tensors.
TensorShape compute_deep_convolution_shape(const ITensorInfo &input, const ITensorInfo &weights, PadStrideInfo conv_info)
Calculate the deep convolution shape output shape of a tensor.
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
uint8x16_t vloadq(const uint8_t *ptr)
#define ARM_COMPUTE_RETURN_ON_ERROR(status)
Checks if a status contains an error and returns it.
virtual DataType data_type() const =0
Data type used for each element of the tensor.
uint8x8_t vadd(const uint8x8_t &a, const uint8x8_t &b)
1 channel, 1 F32 per channel
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
const DataLayout data_layout
Store the tensor's metadata.
float16x8_t vaddq_f16(float16x8_t, float16x8_t)
#define ARM_COMPUTE_ERROR_THROW_ON(status)
void run(const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
unsigned int pad_top() const
Get the top padding.
int get_input_num_elems_processed(unsigned int num_elems_written_per_iteration)
const size_t input_stride_y
#define ARM_COMPUTE_RETURN_ERROR_ON(cond)
If the condition is true, an error is returned.
decltype(strategy::transforms) typedef type
Interface for Neon tensor.
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(...)
Copyright (c) 2017-2021 Arm Limited.
1 channel, 1 F16 per channel
float32x4x2_t convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)
#define ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(...)
uint8_t vgetlane(const uint8x8_t vector, const unsigned int lane)
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
bool update_window_and_padding(Window &win, Ts &&... patterns)
Update window and padding size for each of the access patterns.
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
auto ceil_to_multiple(S value, T divisor) -> decltype(((value+divisor - 1)/divisor) *divisor)
Computes the smallest number larger or equal to value that is a multiple of divisor.
virtual uint8_t * buffer() const =0
Interface to be implemented by the child class to return a pointer to CPU memory. ...
const size_t conv_stride_x
bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, QuantizationInfo quantization_info=QuantizationInfo())
Auto initialize the tensor info (shape, number of channels and data type) if the current assignment i...
virtual std::unique_ptr< T > clone() const =0
Provide a clone of the current object of class T.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
unsigned int pad_right() const
Get the right padding.
uint8x8_t vgetlow(const uint8x16_t val)
Padding and stride information class.
void set(size_t dimension, const Dimension &dim)
Set the values of a given dimension.
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
Num samples, channels, height, width.
uint8x8_t vgethigh(const uint8x16_t val)
const size_t conv_stride_y
static constexpr size_t DimY
Alias for dimension 1 also known as Y dimension.
BorderSize border_size() const override
The size of the border for that kernel.
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
Information about executing thread and CPU.
#define ARM_COMPUTE_CREATE_ERROR(error_code, msg)
Creates an error with a given message.
static constexpr size_t DimZ
Alias for dimension 2 also known as Z dimension.
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(...)
Num samples, height, width, channels.
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage)
NEDirectConvolutionLayerKernel()
Default constructor.
static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const PadStrideInfo &conv_info)
Static function to check if given info will lead to a valid configuration of NEDirectConvolutionLayer...
#define ARM_COMPUTE_RETURN_ERROR_ON_MSG(cond, msg)
If the condition is true, an error is returned.
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
uint8x8_t vdup_n(uint8_t value, traits::vector_64_tag)
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_funct...
float32x4x3_t load_matrix_row(const float *ptr)
const size_t input_stride_z
Includes all wrapper headers at once.
uint8x8_t vmla(const uint8x8_t &a, const uint8x8_t &b, const uint8x8_t &c)
size_t get_data_layout_dimension_index(const DataLayout data_layout, const DataLayoutDimension data_layout_dimension)
Get the index of the given dimension.
unsigned int pad_bottom() const
Get the bottom padding.
DataType
Available data types.
unsigned int pad_left() const
Get the left padding.
DataLayout
[DataLayout enum definition]
const size_t conv_pad_left
Describe a multidimensional execution window.
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)
virtual DataLayout data_layout() const =0
Get the data layout of the tensor.