Compute Library
 21.11
CpuDirectConv2dKernel.cpp
1 /*
2  * Copyright (c) 2017-2021 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
25 
28 
29 #include "arm_compute/core/Error.h"
33 #include "arm_compute/core/Types.h"
34 #include "arm_compute/core/Utils.h"
38 #include "src/core/CPP/Validate.h"
42 
43 #include <algorithm>
44 
45 using namespace arm_compute::detail;
46 
47 namespace arm_compute
48 {
49 namespace cpu
50 {
51 namespace kernels
52 {
53 namespace
54 {
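// Stride-specialized loads: internal_vld1q<1> is a plain vector load, while internal_vld1q<2> and <3>
// use the de-interleaving vld2q/vld3q loads and keep only val[0], i.e. every 2nd / 3rd input element.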
55 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
56 template <unsigned int stridex>
57 float16x8_t internal_vld1q(const float16_t *in);
58 
59 template <>
60 float16x8_t internal_vld1q<1>(const float16_t *in)
61 {
62  return vld1q_f16(in);
63 }
64 
65 template <>
66 float16x8_t internal_vld1q<2>(const float16_t *in)
67 {
68  const float16x8x2_t tmp = vld2q_f16(in);
69  return tmp.val[0];
70 }
71 
72 template <>
73 float16x8_t internal_vld1q<3>(const float16_t *in)
74 {
75  const float16x8x3_t tmp = vld3q_f16(in);
76  return tmp.val[0];
77 }
78 
79 inline float16x8_t internal_vdupq_n(float16_t v)
80 {
81  return vdupq_n_f16(v);
82 }
83 
84 inline void internal_vst1q(float16_t *p, const float16x8_t &v)
85 {
86  vst1q_f16(p, v);
87 }
88 
89 float16x8_t internal_vmull(const float16x8_t &x, const float16x8_t &y)
90 {
91  return vmulq_f16(x, y);
92 }
93 
94 inline float16x8_t internal_vmlal(const float16x8_t &x, const float16x8_t &y, const float16x8_t &z)
95 {
96  return vaddq_f16(x, vmulq_f16(y, z));
97 }
98 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
99 
100 template <unsigned int stridex>
101 float32x4_t internal_vld1q(const float *in);
102 
103 template <>
104 float32x4_t internal_vld1q<1>(const float *in)
105 {
106  return vld1q_f32(in);
107 }
108 
109 template <>
110 float32x4_t internal_vld1q<2>(const float *in)
111 {
112  const float32x4x2_t tmp = vld2q_f32(in);
113  return tmp.val[0];
114 }
115 
116 template <>
117 float32x4_t internal_vld1q<3>(const float *in)
118 {
119  const float32x4x3_t tmp = vld3q_f32(in);
120  return tmp.val[0];
121 }
122 
123 inline float32x4_t internal_vdupq_n(float v)
124 {
125  return vdupq_n_f32(v);
126 }
127 
128 inline void internal_vst1q(float *p, const float32x4_t &v)
129 {
130  vst1q_f32(p, v);
131 }
132 
133 float32x4_t internal_vmull(const float32x4_t &x, const float32x4_t &y)
134 {
135  return vmulq_f32(x, y);
136 }
137 
138 inline float32x4_t internal_vmlal(const float32x4_t &x, const float32x4_t &y, const float32x4_t &z)
139 {
140  return vmlaq_f32(x, y, z);
141 }
142 
143 constexpr int small_tensor_size_optim = 8;
144 inline bool run_optim_small_tensor_info(const ITensorInfo *t)
145 {
146  return t->dimension(Window::DimX) <= small_tensor_size_optim && t->dimension(Window::DimY) <= small_tensor_size_optim;
147 }
148 
149 inline bool run_optim_small_tensor(const ITensor *t)
150 {
151  return run_optim_small_tensor_info(t->info());
152 }
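// These predicates gate the dedicated small-tensor 1x1 convolver (convolver_w1x1_i8x8_f32) selected in convolve_1x1<float, float>.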
153 
154 // Optimized convolver for 1x1 kernels used only where input width and height are both <= 8
155 // For big Z, as in Input=7x7x832, this implementation is faster than the general code because it doesn't need to
156 // store intermediate results in memory. Temporary results are stored in SIMD registers directly and then written to the output buffer.
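// With width and height both <= 8, each output row fits in two float32x4_t accumulators (accum0/accum1 below),
// so a whole 8x8 output plane can be kept in sixteen Q registers until it is stored.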
157 template <unsigned int stridex>
158 class convolver_w1x1_i8x8_f32
159 {
160 public:
161  static void convolve(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
162  {
163  ARM_COMPUTE_ERROR_ON(src->info()->dimension(Window::DimX) > small_tensor_size_optim);
164  ARM_COMPUTE_ERROR_ON(src->info()->dimension(Window::DimY) > small_tensor_size_optim);
165 
166  const int input_stride_x = src->info()->strides_in_bytes().x();
167  const int input_stride_y = src->info()->strides_in_bytes().y();
168  const int input_stride_z = src->info()->strides_in_bytes().z();
169  const int output_stride_y = dst->info()->strides_in_bytes().y();
170  const int output_stride_z = dst->info()->strides_in_bytes().z();
171  const int kernel_stride_z = weights->info()->strides_in_bytes().z();
172  const int kernel_stride_w = weights->info()->strides_in_bytes()[3];
173  const int output_h = dst->info()->dimension(1);
174  const int range_z = window.z().end() - window.z().start();
175  const int kernel_depth = weights->info()->dimension(Window::DimZ);
176  const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
177  const unsigned int conv_pad_left = conv_info.pad_left();
178  const unsigned int conv_pad_top = conv_info.pad_top();
179 
180  // setup output window for the iterator
181  Window window_out = window;
182  window_out.set(Window::DimX, Window::Dimension(0, dst->info()->dimension(Window::DimX), dst->info()->dimension(Window::DimX)));
183  window_out.set(Window::DimY, Window::Dimension(0, dst->info()->dimension(Window::DimY), dst->info()->dimension(Window::DimY)));
184  window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), range_z));
185 
186  // setup input window for the iterator
187  Window window_in = window;
188  // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0
189  window_in.set(Window::DimX, Window::Dimension(0, 0, 0));
190  window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
191  window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
192 
193  Window window_k = calculate_max_window(*weights->info(), Steps(1u));
194  Iterator out(dst, window_out);
195  Iterator in(src, window_in);
196  Iterator k(weights, window_k);
197 
198  const uint8_t *k_ptr = k.ptr();
199 
200  execute_window_loop(window_out, [&](const Coordinates & id)
201  {
202  const uint8_t *input_ptr = in.ptr() - conv_pad_left * input_stride_x - conv_pad_top * input_stride_y;
203  uint8_t *out_ptr = out.ptr();
204  int ih = 0;
205  int oh = 0;
206  std::array<float32x4_t, 8> accum0 = { vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) };
207  std::array<float32x4_t, 8> accum1 = { vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) };
208  for(int oz = 0; oz < range_z; ++oz)
209  {
210  accum0[0] = accum0[1] = accum0[2] = accum0[3] = accum0[4] = accum0[5] = accum0[6] = accum0[7] = vdupq_n_f32(0.f);
211  accum1[0] = accum1[1] = accum1[2] = accum1[3] = accum1[4] = accum1[5] = accum1[6] = accum1[7] = vdupq_n_f32(0.f);
212  auto p_out_base = out_ptr + oz * output_stride_z;
213  for(int p = 0; p < kernel_depth; ++p)
214  {
215  const auto k_val = reinterpret_cast<const float *>(k_ptr + p * kernel_stride_z + (id.z() + oz) * kernel_stride_w);
216  const auto vk0 = internal_vdupq_n(*k_val);
217  for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
218  {
219  const int offset_xy = ih * input_stride_y;
220  auto in_val = reinterpret_cast<const float *>(input_ptr + p * input_stride_z + offset_xy);
221  auto v_in0 = internal_vld1q<stridex>(in_val);
222  auto v_in1 = internal_vld1q<stridex>(in_val + 4);
223  accum0[oh] = vmlaq_f32(accum0[oh], vk0, v_in0);
224  accum1[oh] = vmlaq_f32(accum1[oh], vk0, v_in1);
225  }
226  }
227  for(oh = 0; oh < output_h; ++oh)
228  {
229  auto p_out = reinterpret_cast<float *>(p_out_base + oh * output_stride_y);
230  vst1q_f32(p_out, accum0[oh]);
231  vst1q_f32(p_out + 4, accum1[oh]);
232  }
233  }
234  },
235  in, out);
236  }
237 };
238 
239 template <typename T1, typename T2, unsigned int stridex>
240 class convolver_1x1
241 {
242 public:
243  static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
244  const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
245  {
246  const int input_stride_x = src->info()->strides_in_bytes().x();
247  const int input_stride_y = src->info()->strides_in_bytes().y();
248  const int input_stride_z = src->info()->strides_in_bytes().z();
249  const int output_stride_y = dst->info()->strides_in_bytes().y();
250  const int output_stride_z = dst->info()->strides_in_bytes().z();
251  const int kernel_stride_z = weights->info()->strides_in_bytes().z();
252  const int kernel_stride_w = weights->info()->strides_in_bytes()[3];
253  const int output_w = dst->info()->dimension(0);
254  const int output_h = dst->info()->dimension(1);
255  const int range_z = window.z().end() - window.z().start();
256  const int kernel_depth = weights->info()->dimension(Window::DimZ);
257  const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
258  const unsigned int conv_pad_left = conv_info.pad_left();
259  const unsigned int conv_pad_top = conv_info.pad_top();
260 
261  // setup output window for the iterator
262  Window window_out = window;
263  window_out.set(Window::DimX, Window::Dimension(0, dst->info()->dimension(Window::DimX), dst->info()->dimension(Window::DimX)));
264  window_out.set(Window::DimY, Window::Dimension(0, dst->info()->dimension(Window::DimY), dst->info()->dimension(Window::DimY)));
265  window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), range_z));
266 
267  // setup input window for the iterator
268  Window window_in = window;
269  // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0
270  window_in.set(Window::DimX, Window::Dimension(0, 0, 0));
271  window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
272  window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
273 
274  Window window_k = calculate_max_window(*weights->info(), Steps(1u));
275  Iterator out(dst, window_out);
276  Iterator in(src, window_in);
277  Iterator k(weights, window_k);
278 
279  const uint8_t *k_ptr = k.ptr();
280 
281  execute_window_loop(window_out, [&](const Coordinates & id)
282  {
283  /*
284  For a detailed explanation of how the algorithm works, refer to template <> class convolver_3x3<1>
285  */
286  const uint8_t *input_ptr = in.ptr() - conv_pad_left * input_stride_x - conv_pad_top * input_stride_y;
287  uint8_t *out_ptr = out.ptr();
288  int ih = 0;
289  int oh = 0;
290  for(int oz = 0; oz < range_z; ++oz)
291  {
292  auto p_out_base = out_ptr + oz * output_stride_z;
293  // Step 1
294  {
295  const auto k_val = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + (id.z() + oz) * kernel_stride_w);
296  const auto vk = internal_vdupq_n(*k_val);
297  for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
298  {
299  const int offset_xy = ih * input_stride_y;
300  auto in_val = reinterpret_cast<const T1 *>(input_ptr + (0 * input_stride_z + offset_xy));
301  auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
302  for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, in_val += num_elems_read_per_iteration, p_out += num_elems_written_per_iteration)
303  {
304  internal_vst1q(p_out, internal_vmull(vk, internal_vld1q<stridex>(in_val)));
305  }
306  }
307  }
308 
309  // Step 2
310  for(int p = 1; p < kernel_depth; ++p)
311  {
312  const auto k_val = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + (id.z() + oz) * kernel_stride_w);
313  const auto vk = internal_vdupq_n(*k_val);
314  for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
315  {
316  const int offset_xy = ih * input_stride_y;
317  auto in_val = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + offset_xy);
318  auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
319  for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, in_val += num_elems_read_per_iteration, p_out += num_elems_written_per_iteration)
320  {
321  internal_vst1q(p_out, internal_vmlal(internal_vld1q<1>(p_out), vk, internal_vld1q<stridex>(in_val)));
322  }
323  }
324  }
325  }
326  },
327  in, out);
328  }
329 };
330 
331 template <unsigned int stridex>
332 float32x4x2_t convolve_5x5(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4,
333  const float *m0, const float *m1, const float *m2, const float *m3, const float *m4);
334 
335 inline float32x4x3_t load_matrix_hi(const float *const m0, const float *const m1, const float *const m2)
336 {
337  const float32x4x3_t m00 =
338  {
339  {
340  vld1q_dup_f32(m0),
341  vld1q_dup_f32(m1),
342  vld1q_dup_f32(m2)
343  }
344  };
345  return m00;
346 }
347 
348 inline float32x4x2_t load_matrix_lo(const float *const m3, const float *const m4)
349 {
350  const float32x4x2_t m00 =
351  {
352  {
353  vld1q_dup_f32(m3),
354  vld1q_dup_f32(m4)
355  }
356  };
357  return m00;
358 }
359 
360 inline float32x4x3_t load_input(const float *const in)
361 {
362  const float32x4x3_t vin =
363  {
364  {
365  vld1q_f32(in),
366  vld1q_f32(in + 4),
367  vld1q_f32(in + 8)
368  }
369  };
370  return vin;
371 }
372 
373 template <>
374 inline float32x4x2_t convolve_5x5<1>(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4,
375  const float *m0, const float *m1, const float *m2, const float *m3, const float *m4)
376 {
377  const float32x4x3_t vin0 = load_input(in_0);
378  const float32x4x3_t vin1 = load_input(in_1);
379  const float32x4x3_t vin2 = load_input(in_2);
380  const float32x4x3_t vin3 = load_input(in_3);
381  const float32x4x3_t vin4 = load_input(in_4);
382  const float32x4x3_t m00 = load_matrix_hi(m0, 1 + m0, 2 + m0);
383  const float32x4x2_t m01 = load_matrix_lo(3 + m0, 4 + m0);
384  const float32x4x3_t m10 = load_matrix_hi(m1, 1 + m1, 2 + m1);
385  const float32x4x2_t m11 = load_matrix_lo(3 + m1, 4 + m1);
386  const float32x4x3_t m20 = load_matrix_hi(m2, 1 + m2, 2 + m2);
387  const float32x4x2_t m21 = load_matrix_lo(3 + m2, 4 + m2);
388  const float32x4x3_t m30 = load_matrix_hi(m3, 1 + m3, 2 + m3);
389  const float32x4x2_t m31 = load_matrix_lo(3 + m3, 4 + m3);
390  const float32x4x3_t m40 = load_matrix_hi(m4, 1 + m4, 2 + m4);
391  const float32x4x2_t m41 = load_matrix_lo(3 + m4, 4 + m4);
392 
393  float32x4x2_t out =
394  {
395  {
396  vmulq_f32(vin0.val[0], m00.val[0]),
397  vmulq_f32(vin0.val[1], m00.val[0])
398  }
399  };
400 
401  out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin0.val[0], vin0.val[1], 1), m00.val[1]);
402  out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin0.val[0], vin0.val[1], 2), m00.val[2]);
403  out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin0.val[0], vin0.val[1], 3), m01.val[0]);
404  out.val[0] = vmlaq_f32(out.val[0], vin0.val[1], m01.val[1]);
405 
406  out.val[0] = vmlaq_f32(out.val[0], vin1.val[0], m10.val[0]);
407  out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin1.val[0], vin1.val[1], 1), m10.val[1]);
408  out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin1.val[0], vin1.val[1], 2), m10.val[2]);
409  out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin1.val[0], vin1.val[1], 3), m11.val[0]);
410  out.val[0] = vmlaq_f32(out.val[0], vin1.val[1], m11.val[1]);
411 
412  out.val[0] = vmlaq_f32(out.val[0], vin2.val[0], m20.val[0]);
413  out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin2.val[0], vin2.val[1], 1), m20.val[1]);
414  out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin2.val[0], vin2.val[1], 2), m20.val[2]);
415  out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin2.val[0], vin2.val[1], 3), m21.val[0]);
416  out.val[0] = vmlaq_f32(out.val[0], vin2.val[1], m21.val[1]);
417 
418  out.val[0] = vmlaq_f32(out.val[0], vin3.val[0], m30.val[0]);
419  out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin3.val[0], vin3.val[1], 1), m30.val[1]);
420  out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin3.val[0], vin3.val[1], 2), m30.val[2]);
421  out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin3.val[0], vin3.val[1], 3), m31.val[0]);
422  out.val[0] = vmlaq_f32(out.val[0], vin3.val[1], m31.val[1]);
423 
424  out.val[0] = vmlaq_f32(out.val[0], vin4.val[0], m40.val[0]);
425  out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin4.val[0], vin4.val[1], 1), m40.val[1]);
426  out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin4.val[0], vin4.val[1], 2), m40.val[2]);
427  out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin4.val[0], vin4.val[1], 3), m41.val[0]);
428  out.val[0] = vmlaq_f32(out.val[0], vin4.val[1], m41.val[1]);
429 
430  out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin0.val[1], vin0.val[2], 1), m00.val[1]);
431  out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin0.val[1], vin0.val[2], 2), m00.val[2]);
432  out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin0.val[1], vin0.val[2], 3), m01.val[0]);
433  out.val[1] = vmlaq_f32(out.val[1], vin0.val[2], m01.val[1]);
434 
435  out.val[1] = vmlaq_f32(out.val[1], vin1.val[1], m10.val[0]);
436  out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin1.val[1], vin1.val[2], 1), m10.val[1]);
437  out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin1.val[1], vin1.val[2], 2), m10.val[2]);
438  out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin1.val[1], vin1.val[2], 3), m11.val[0]);
439  out.val[1] = vmlaq_f32(out.val[1], vin1.val[2], m11.val[1]);
440 
441  out.val[1] = vmlaq_f32(out.val[1], vin2.val[1], m20.val[0]);
442  out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin2.val[1], vin2.val[2], 1), m20.val[1]);
443  out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin2.val[1], vin2.val[2], 2), m20.val[2]);
444  out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin2.val[1], vin2.val[2], 3), m21.val[0]);
445  out.val[1] = vmlaq_f32(out.val[1], vin2.val[2], m21.val[1]);
446 
447  out.val[1] = vmlaq_f32(out.val[1], vin3.val[1], m30.val[0]);
448  out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin3.val[1], vin3.val[2], 1), m30.val[1]);
449  out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin3.val[1], vin3.val[2], 2), m30.val[2]);
450  out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin3.val[1], vin3.val[2], 3), m31.val[0]);
451  out.val[1] = vmlaq_f32(out.val[1], vin3.val[2], m31.val[1]);
452 
453  out.val[1] = vmlaq_f32(out.val[1], vin4.val[1], m40.val[0]);
454  out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin4.val[1], vin4.val[2], 1), m40.val[1]);
455  out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin4.val[1], vin4.val[2], 2), m40.val[2]);
456  out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin4.val[1], vin4.val[2], 3), m41.val[0]);
457  out.val[1] = vmlaq_f32(out.val[1], vin4.val[2], m41.val[1]);
458 
459  return out;
460 }
461 
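// Stride 2: compute the stride-1 result and gather every second element (0, 2, 4, 6) into the lower lanes of out.val[0].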
462 template <>
463 inline float32x4x2_t convolve_5x5<2>(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4,
464  const float *m0, const float *m1, const float *m2, const float *m3, const float *m4)
465 {
466  float32x4x2_t out = convolve_5x5<1>(in_0, in_1, in_2, in_3, in_4, m0, m1, m2, m3, m4);
467  out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1);
468  out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2);
469  out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3);
470  return out;
471 }
472 
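// Stride 3: compute the stride-1 result and move element 3 into lane 1, so the first lanes of out.val[0] hold elements 0 and 3.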
473 template <>
474 inline float32x4x2_t convolve_5x5<3>(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4,
475  const float *m0, const float *m1, const float *m2, const float *m3, const float *m4)
476 {
477  float32x4x2_t out = convolve_5x5<1>(in_0, in_1, in_2, in_3, in_4, m0, m1, m2, m3, m4);
478  out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
479  return out;
480 }
481 
482 template <typename T1, typename T2, unsigned int stridex>
483 class convolver_3x3
484 {
485 public:
486  static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
487  const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
488  {
489  ARM_COMPUTE_UNUSED(num_elems_read_per_iteration);
490  const int input_stride_x = src->info()->strides_in_bytes().x();
491  const int input_stride_y = src->info()->strides_in_bytes().y();
492  const int input_stride_z = src->info()->strides_in_bytes().z();
493  const int output_stride_y = dst->info()->strides_in_bytes().y();
494  const int output_stride_z = dst->info()->strides_in_bytes().z();
495  const int kernel_stride_x = weights->info()->strides_in_bytes().x();
496  const int kernel_stride_y = weights->info()->strides_in_bytes().y();
497  const int kernel_stride_z = weights->info()->strides_in_bytes().z();
498  const int kernel_stride_w = weights->info()->strides_in_bytes()[3];
499  const int output_w = dst->info()->dimension(0);
500  const int output_h = dst->info()->dimension(1);
501  const int num_planes_z = window.z().end() - window.z().start();
502  const int delta_input = get_input_num_elems_processed(num_elems_written_per_iteration, stridex);
503  const int kernel_depth = weights->info()->dimension(Window::DimZ);
504  const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
505  const unsigned int conv_pad_left = conv_info.pad_left();
506  const unsigned int conv_pad_top = conv_info.pad_top();
507 
508  // setup output window for the iterator
509  Window window_out = window;
510  window_out.set(Window::DimX, Window::Dimension(0, dst->info()->dimension(Window::DimX), dst->info()->dimension(Window::DimX)));
511  window_out.set(Window::DimY, Window::Dimension(0, dst->info()->dimension(Window::DimY), dst->info()->dimension(Window::DimY)));
512  window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), num_planes_z));
513 
514  // setup input window for the iterator
515  Window window_in = window;
516  // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0
517  window_in.set(Window::DimX, Window::Dimension(0, 0, 0));
518  window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
519  window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
520 
521  Window window_k = calculate_max_window(*weights->info(), Steps(1u));
522 
523  Iterator out(dst, window_out);
524  Iterator in(src, window_in);
525  Iterator k(weights, window_k);
526 
527  const uint8_t *k_ptr = k.ptr();
528 
529  execute_window_loop(window_out, [&](const Coordinates & id)
530  {
531  const uint8_t *input_ptr = in.ptr() - conv_pad_left * input_stride_x - conv_pad_top * input_stride_y;
532  uint8_t *out_ptr = out.ptr();
533  int ih = 0;
534  int oh = 0;
535  /*
536  Each thread executing this kernel computes one or more planes of the output volume.
537 
538  Let's say the 3rd dimension of the output volume is 32: the first thread will compute the output for Z = [0,7], the second thread for Z = [8,15],
539  the third thread for [16,23] and the fourth thread for [24,31].
540 
541  The algorithm's outer loop iterates over Z, P, Y, X, where P is the depth/3rd dimension of each kernel. This order is not arbitrary: the main benefit is
542  that we set up the Neon registers containing the kernel's values only once and then compute each XY using the preloaded registers, as opposed to doing this for every XY value.
543 
544  The algorithm does not require allocating any additional memory and computes the results directly in-place in two stages:
545  1) Convolve plane 0 with kernel 0 and initialize the corresponding output plane with these values.
546  2) Convolve the remaining planes and accumulate the results in the output's plane which has been initialized in step 1.
547  */
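// For example, with kernel_depth == 3: Step 1 initializes the output plane with input plane 0 convolved with kernel plane 0,
// and Step 2 accumulates the contributions of planes 1 and 2 on top of it.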
548  for(int oz = 0; oz < num_planes_z; ++oz)
549  {
550  const int zoffset = id.z() + oz;
551  uint8_t *p_out_base = out_ptr + oz * output_stride_z;
552  // Step 1
553  {
554  const auto ptr_k_r0 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x);
555  const auto ptr_k_r1 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x);
556  const auto ptr_k_r2 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x);
557  const auto vk_r0 = load_matrix_row(ptr_k_r0);
558  const auto vk_r1 = load_matrix_row(ptr_k_r1);
559  const auto vk_r2 = load_matrix_row(ptr_k_r2);
560  for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
561  {
562  auto in_top = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 0) * input_stride_y);
563  auto in_mid = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 1) * input_stride_y);
564  auto in_low = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 2) * input_stride_y);
565  auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
566  for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
567  in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration)
568  {
569  convolve_3x3<false>(in_top, in_mid, in_low, p_out, vk_r0, vk_r1, vk_r2, stridex);
570  }
571  }
572  }
573  // Step 2
574  for(int p = 1; p < kernel_depth; ++p)
575  {
576  const uint8_t *ptr_k_base = k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w;
577  const uint8_t *input_base = input_ptr + p * input_stride_z;
578  const auto ptr_k_r0 = reinterpret_cast<const T1 *>(ptr_k_base);
579  const auto ptr_k_r1 = reinterpret_cast<const T1 *>(ptr_k_base + kernel_stride_y);
580  const auto ptr_k_r2 = reinterpret_cast<const T1 *>(ptr_k_base + kernel_stride_y * 2);
581  const auto vk_r0 = load_matrix_row(ptr_k_r0);
582  const auto vk_r1 = load_matrix_row(ptr_k_r1);
583  const auto vk_r2 = load_matrix_row(ptr_k_r2);
584  for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
585  {
586  auto in_top = reinterpret_cast<const T1 *>(input_base + (ih + 0) * input_stride_y);
587  auto in_mid = reinterpret_cast<const T1 *>(input_base + (ih + 1) * input_stride_y);
588  auto in_low = reinterpret_cast<const T1 *>(input_base + (ih + 2) * input_stride_y);
589  auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
590  for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
591  in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration)
592  {
593  convolve_3x3<true>(in_top, in_mid, in_low, p_out, vk_r0, vk_r1, vk_r2, stridex);
594  }
595  }
596  }
597  }
598  },
599  in, out);
600  }
601 };
602 
603 template <typename T1, typename T2, unsigned int stridex>
604 class convolver_5x5
605 {
606 public:
607  static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
608  const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
609  {
610  ARM_COMPUTE_UNUSED(num_elems_read_per_iteration);
611  const int input_stride_x = src->info()->strides_in_bytes().x();
612  const int input_stride_y = src->info()->strides_in_bytes().y();
613  const int input_stride_z = src->info()->strides_in_bytes().z();
614  const int output_stride_y = dst->info()->strides_in_bytes().y();
615  const int output_stride_z = dst->info()->strides_in_bytes().z();
616  const int kernel_stride_x = weights->info()->strides_in_bytes().x();
617  const int kernel_stride_y = weights->info()->strides_in_bytes().y();
618  const int kernel_stride_z = weights->info()->strides_in_bytes().z();
619  const int kernel_stride_w = weights->info()->strides_in_bytes()[3];
620  const int output_w = dst->info()->dimension(0);
621  const int output_h = dst->info()->dimension(1);
622  const int num_planes_z = window.z().end() - window.z().start();
623  const int delta_input = get_input_num_elems_processed(num_elems_written_per_iteration, stridex);
624  const int kernel_depth = weights->info()->dimension(Window::DimZ);
625  const unsigned int conv_stride_y = std::get<1>(conv_info.stride());
626  const unsigned int conv_pad_left = conv_info.pad_left();
627  const unsigned int conv_pad_top = conv_info.pad_top();
628 
629  // setup output window for the iterator
630  Window window_out = window;
631  window_out.set(Window::DimX, Window::Dimension(0, dst->info()->dimension(Window::DimX), dst->info()->dimension(Window::DimX)));
632  window_out.set(Window::DimY, Window::Dimension(0, dst->info()->dimension(Window::DimY), dst->info()->dimension(Window::DimY)));
633  window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), num_planes_z));
634 
635  // setup input window for the iterator
636  Window window_in = window;
637  // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0
638  window_in.set(Window::DimX, Window::Dimension(0, 0, 0));
639  window_in.set(Window::DimY, Window::Dimension(0, 0, 0));
640  window_in.set(Window::DimZ, Window::Dimension(0, 0, 0));
641 
642  Window window_k = calculate_max_window(*weights->info(), Steps(1u));
643 
644  Iterator out(dst, window_out);
645  Iterator in(src, window_in);
646  Iterator k(weights, window_k);
647 
648  const uint8_t *k_ptr = k.ptr();
649 
650  execute_window_loop(window_out, [&](const Coordinates & id)
651  {
652  const uint8_t *input_ptr = in.ptr() - conv_pad_left * input_stride_x - conv_pad_top * input_stride_y;
653  uint8_t *out_ptr = out.ptr();
654  int ih = 0;
655  int oh = 0;
656  for(int oz = 0; oz < num_planes_z; ++oz)
657  {
658  const int zoffset = id.z() + oz;
659  uint8_t *p_out_base = out_ptr + oz * output_stride_z;
660  // Step 1
661  {
662  const auto ptr_k_r0 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x);
663  const auto ptr_k_r1 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x);
664  const auto ptr_k_r2 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x);
665  const auto ptr_k_r3 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 3 * kernel_stride_y + 0 * kernel_stride_x);
666  const auto ptr_k_r4 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 4 * kernel_stride_y + 0 * kernel_stride_x);
667  for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
668  {
669  auto in_0 = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 0) * input_stride_y);
670  auto in_1 = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 1) * input_stride_y);
671  auto in_2 = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 2) * input_stride_y);
672  auto in_3 = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 3) * input_stride_y);
673  auto in_4 = reinterpret_cast<const T1 *>(input_ptr + 0 * input_stride_z + (ih + 4) * input_stride_y);
674  auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
675  for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
676  in_0 += delta_input, in_1 += delta_input, in_2 += delta_input, in_3 += delta_input, in_4 += delta_input, p_out += num_elems_written_per_iteration)
677  {
678  auto vres = convolve_5x5<stridex>(in_0, in_1, in_2, in_3, in_4, ptr_k_r0, ptr_k_r1, ptr_k_r2, ptr_k_r3, ptr_k_r4);
679  store_results<stridex>(p_out, vres);
680  }
681  }
682  }
683  // Step 2
684  for(int p = 1; p < kernel_depth; ++p)
685  {
686  const auto ptr_k_r0 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x);
687  const auto ptr_k_r1 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x);
688  const auto ptr_k_r2 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x);
689  const auto ptr_k_r3 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 3 * kernel_stride_y + 0 * kernel_stride_x);
690  const auto ptr_k_r4 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 4 * kernel_stride_y + 0 * kernel_stride_x);
691 
692  for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y)
693  {
694  auto in_0 = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 0) * input_stride_y);
695  auto in_1 = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 1) * input_stride_y);
696  auto in_2 = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 2) * input_stride_y);
697  auto in_3 = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 3) * input_stride_y);
698  auto in_4 = reinterpret_cast<const T1 *>(input_ptr + p * input_stride_z + (ih + 4) * input_stride_y);
699  auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
700  for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
701  in_0 += delta_input, in_1 += delta_input, in_2 += delta_input, in_3 += delta_input, in_4 += delta_input, p_out += num_elems_written_per_iteration)
702  {
703  auto vres = convolve_5x5<stridex>(in_0, in_1, in_2, in_3, in_4, ptr_k_r0, ptr_k_r1, ptr_k_r2, ptr_k_r3, ptr_k_r4);
704  accumulate_results<stridex>(p_out, vres);
705  }
706  }
707  }
708  }
709  },
710  in, out);
711  }
712 };
713 
714 template <typename T1, typename T2>
715 inline void convolve_1x1(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
716  const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
717 {
718  const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
719  switch(conv_stride_x)
720  {
721  case 1:
722  convolver_1x1<T1, T2, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
723  break;
724  case 2:
725  convolver_1x1<T1, T2, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
726  break;
727  case 3:
728  convolver_1x1<T1, T2, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
729  break;
730  default:
731  ARM_COMPUTE_ERROR("Not implemented");
732  }
733 }
734 
735 template <>
736 inline void convolve_1x1<float, float>(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
737  const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
738 {
739  const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
740  if(run_optim_small_tensor(src))
741  {
742  switch(conv_stride_x)
743  {
744  case 1:
745  convolver_w1x1_i8x8_f32<1>::convolve(window, src, weights, dst, conv_info);
746  break;
747  case 2:
748  convolver_w1x1_i8x8_f32<2>::convolve(window, src, weights, dst, conv_info);
749  break;
750  case 3:
751  convolver_w1x1_i8x8_f32<3>::convolve(window, src, weights, dst, conv_info);
752  break;
753  default:
754  ARM_COMPUTE_ERROR("Not implemented");
755  }
756  }
757  else
758  {
759  switch(conv_stride_x)
760  {
761  case 1:
762  convolver_1x1<float, float, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
763  break;
764  case 2:
765  convolver_1x1<float, float, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
766  break;
767  case 3:
768  convolver_1x1<float, float, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
769  break;
770  default:
771  ARM_COMPUTE_ERROR("Not implemented");
772  }
773  }
774 }
775 
776 template <typename T1, typename T2>
777 inline void convolve_3x3(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
778  const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
779 {
780  const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
781  switch(conv_stride_x)
782  {
783  case 1:
784  convolver_3x3<T1, T2, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
785  break;
786  case 2:
787  convolver_3x3<T1, T2, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
788  break;
789  case 3:
790  convolver_3x3<T1, T2, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
791  break;
792  default:
793  ARM_COMPUTE_ERROR("Not implemented");
794  }
795 }
796 
797 template <typename T1, typename T2>
798 inline void convolve_5x5(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration,
799  const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
800 {
801  const unsigned int conv_stride_x = std::get<0>(conv_info.stride());
802  switch(conv_stride_x)
803  {
804  case 1:
805  convolver_5x5<T1, T2, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
806  break;
807  case 2:
808  convolver_5x5<T1, T2, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
809  break;
810  case 3:
811  convolver_5x5<T1, T2, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, src, weights, dst, conv_info);
812  break;
813  default:
814  ARM_COMPUTE_ERROR("Not implemented");
815  }
816 }
817 
818 Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info)
819 {
820  ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
821  ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN);
825 
826  const DataLayout data_layout = src->data_layout();
827  const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
828  const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
829  const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
830 
831  ARM_COMPUTE_RETURN_ERROR_ON_MSG(std::get<0>(conv_info.stride()) > 3, "Strides larger than 3 not supported.");
832  ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(channel_idx) != src->dimension(channel_idx));
833  ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) != weights->dimension(height_idx));
834  ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
835  ARM_COMPUTE_RETURN_ERROR_ON(data_layout == DataLayout::NHWC && src->data_type() != DataType::F32);
836  ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(width_idx) > 3) && (src->data_type() == DataType::F16));
837 
838  // Checks performed when output is configured
839  if(dst->total_size() != 0)
840  {
841  TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info);
842 
843  DataType data_type = src->data_type();
844 
846  ARM_COMPUTE_RETURN_ERROR_ON(dst->data_type() != data_type);
847  }
848 
849  return Status{};
850 }
851 
852 std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info, unsigned int &num_weight_elems_read_per_row,
853  unsigned int &num_elems_read_per_iteration, unsigned int &num_elems_written_per_iteration, BorderSize &border_size)
854 {
855  ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN);
856 
857  const DataLayout data_layout = src->data_layout();
858  const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
859 
860  // Calculate right and bottom border
861  unsigned int kernel_size = weights->dimension(width_idx);
862  const int conv_stride_x = std::get<0>(conv_info.stride());
863  const int conv_stride_y = std::get<1>(conv_info.stride());
864  const int input_width = src->dimension(width_idx);
865 
866  Window win{};
867  bool window_changed = false;
868 
869  if(data_layout == DataLayout::NCHW)
870  {
871  switch(kernel_size)
872  {
873  case 1:
874  {
875  switch(src->data_type())
876  {
877 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
878  case DataType::F16:
879  num_elems_written_per_iteration = 8;
880  break;
881 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
882  case DataType::F32:
883  if(run_optim_small_tensor_info(src))
884  {
885  num_elems_written_per_iteration = 8;
886  }
887  else
888  {
889  num_elems_written_per_iteration = 4;
890  }
891  break;
892  default:
893  ARM_COMPUTE_ERROR("Data type not supported.");
894  break;
895  }
896  num_weight_elems_read_per_row = kernel_size;
897  num_elems_read_per_iteration = conv_stride_x * num_elems_written_per_iteration;
898  break;
899  }
900  case 3:
901  switch(src->data_type())
902  {
903  case DataType::F32:
904  num_weight_elems_read_per_row = 4 + kernel_size - 1;
905  num_elems_read_per_iteration = 12;
906  num_elems_written_per_iteration = 16 >> conv_stride_x;
907  break;
908 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
909  case DataType::F16:
910  num_weight_elems_read_per_row = 8 + kernel_size - 1;
911  num_elems_read_per_iteration = 24;
912  num_elems_written_per_iteration = 32 >> conv_stride_x;
913  break;
914 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
915  default:
916  ARM_COMPUTE_ERROR("Data type not supported.");
917  break;
918  }
919  break;
920  case 5:
921  {
922  switch(src->data_type())
923  {
924  case DataType::F32:
925  num_weight_elems_read_per_row = 4 + kernel_size - 1;
926  num_elems_read_per_iteration = 12;
927  num_elems_written_per_iteration = 16 >> conv_stride_x;
928  break;
929  default:
930  ARM_COMPUTE_ERROR("Data type not supported.");
931  break;
932  }
933  }
934  break;
935  default:
936  {
937  ARM_COMPUTE_ERROR("Not implemented");
938  break;
939  }
940  }
941 
942  // Calculate right pad
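// start_x/end_x estimate how far the vectorized reads can reach along X once the output width is rounded up to the write step;
// anything extending past the input width (upper_bound_w) becomes extra right padding.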
943  int start_x = kernel_size / 2 - static_cast<int>(conv_info.pad_left());
944  int end_x = ceil_to_multiple(static_cast<int>(dst->dimension(0)), num_elems_written_per_iteration) * conv_stride_x;
945  int upper_bound_w = ceil_to_multiple(start_x + end_x, num_elems_read_per_iteration) - input_width;
946 
947  // Calculate border
948  const unsigned int conv_pad_left = conv_info.pad_left();
949  const unsigned int conv_pad_top = conv_info.pad_top();
950  const unsigned int conv_pad_right = std::max(upper_bound_w, 0);
951  const unsigned int conv_pad_bottom = conv_info.pad_bottom();
952 
953  border_size.left = conv_pad_left;
954  border_size.top = conv_pad_top;
955  border_size.right = conv_pad_right;
956  border_size.bottom = conv_pad_bottom;
957 
958  // Configure window
959  win = calculate_max_window(*dst, Steps(num_elems_written_per_iteration));
960 
961  AccessWindowRectangle input_access(src, -conv_pad_left, -conv_pad_top,
962  num_elems_read_per_iteration, kernel_size,
963  conv_stride_x, conv_stride_y);
964  AccessWindowStatic weights_access(weights, 0, 0, num_weight_elems_read_per_row, kernel_size);
965  AccessWindowHorizontal output_access(dst, 0, num_elems_written_per_iteration);
966  window_changed = update_window_and_padding(win, input_access, weights_access, output_access);
967  output_access.set_valid_region(win, ValidRegion(Coordinates(), dst->tensor_shape()));
968  }
969  else
970  {
971  // Configure window NHWC without any padding
972  win = calculate_max_window(*dst, Steps());
973  }
974 
975  Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
976  return std::make_pair(err, win);
977 }
978 
979 bool have_zero_x_internal_padding(ITensorInfo *src, const ITensorInfo *weights)
980 {
981  return (src->padding().left == 0 && weights->padding().left == 0 && src->padding().right == 0 && weights->padding().right == 0);
982 }
983 
984 } // namespace
985 
986 template <typename T>
987 void CpuDirectConv2dKernel::convolve_nhwc_optimized(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst)
988 {
989  // This function assumes that the input and weights have no padding in the channel dimension
990 
991  // Declare useful types
992  using vtype = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
993  using vector_type = typename vtype::type;
994  using tag_type = typename vtype::tag_type;
995 
996  // Scalar quantities
997  const int element_size = src->info()->element_size();
998  const int input_stride_w = src->info()->strides_in_bytes().y() / element_size;
999  const int input_stride_h = src->info()->strides_in_bytes().z() / element_size;
1000  const int input_stride_n = src->info()->strides_in_bytes()[3] / element_size;
1001  const int input_dim_w = src->info()->dimension(1);
1002  const int input_dim_h = src->info()->dimension(2);
1003 
1004  const int output_stride_c = dst->info()->strides_in_bytes().x();
1005 
1006  const unsigned int kernel_stride_w = weights->info()->strides_in_bytes().y() / element_size;
1007  const unsigned int kernel_stride_h = weights->info()->strides_in_bytes().z() / element_size;
1008  const int kernel_dim_w = weights->info()->dimension(1);
1009  const int kernel_dim_h = weights->info()->dimension(2);
1010 
1011  const int conv_pad_top = _conv_info.pad_top();
1012  const int conv_pad_left = _conv_info.pad_left();
1013  const int conv_stride_w = std::get<0>(_conv_info.stride());
1014  const int conv_stride_h = std::get<1>(_conv_info.stride());
1015 
1016  // Set up the window for the output iterator
1017  Window window_out = window;
1018  window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
1019 
1020  // Set up the window for the weights iterator
1021  Window window_w = calculate_max_window(*weights->info(), Steps());
1022  window_w.set(Window::DimX, Window::Dimension(0, 1, 1));
1023  window_w.set(Window::DimY, Window::Dimension(0, 1, 1));
1024  window_w.set(Window::DimZ, Window::Dimension(0, 1, 1));
1025 
1026  Iterator out(dst, window_out);
1027  Iterator wei(weights, window_w);
1028 
1029  constexpr int num_elems_read_per_iteration = 16 / sizeof(T);
1030  /*
1031  * This implementation parallelizes over the full WC plane of the input and weights by
1032  * treating them as a series of elements. For example, with 3x3 weights and
1033  * floating-point vector operations of 4 elements at a time, the first 3
1034  * channel elements of the first row would be taken together with the first
1035  * element of the second row. The 9 elements in each single WC weight plane
1036  * would require two 4-element vector operations and one final single-element operation.
1037  *
1038  * This works because, when we create the input vector to multiply with the weights,
1039  * exactly the required elements are loaded in the same order. Therefore the
1040  * multiplication works on the correct input/weight elements.
1041  */
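// Accordingly, the inner loop below walks the flattened W*C run in steps of num_elems_read_per_iteration
// and handles the leftover elements with a scalar tail loop.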
1042  execute_window_loop(window_out, [&](const Coordinates & id)
1043  {
1044  /*
1045  * Here we create theoretical indices which we then validate for both
1046  * the inputs and the weights.
1047  * As a reminder, this loop takes each output point in NHW; C is handled
1048  * in the weights loop.
1049  */
1050  // We compute the theoretical input starting points
1051  const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left;
1052  const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top;
1053  const int in_w_end_t = in_w_start_t + kernel_dim_w;
1054  const int in_h_end_t = in_h_start_t + kernel_dim_h;
1055 
1056  // We are computing the valid initial and ending input points by checking the borders
1057  const int in_w_start = std::max(in_w_start_t, 0);
1058  const int in_h_start = std::max(in_h_start_t, 0);
1059  const int in_w_end = std::min(in_w_end_t, input_dim_w);
1060  const int in_h_end = std::min(in_h_end_t, input_dim_h);
1061 
1062  // We use the input points to select the valid weight points to use
1063  const int index_wc_start = (in_w_start - in_w_start_t) * kernel_stride_w;
1064  const int index_h_start = in_h_start - in_h_start_t;
1065  const int index_wc_end = (kernel_dim_w - (in_w_end_t - in_w_end)) * kernel_stride_w;
1066  const int index_h_end = kernel_dim_h - (in_h_end_t - in_h_end);
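// index_wc_* are offsets in flattened W*C elements, while index_h_* count kernel rows.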
1067 
1068  execute_window_loop(window_w, [&](const Coordinates & id_w)
1069  {
1070  /*
1071  * This is the loop over the weights, and it goes along N (the batches).
1072  * As a reminder, the batches of the weights are translated into the
1073  * channels of the output.
1074  */
1075  const T *in_ptr_row = reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes())
1076  + id[3] * input_stride_n + in_w_start * input_stride_w + in_h_start * input_stride_h;
1077  const T *weights_ptr_row = reinterpret_cast<const T *>(wei.ptr()) + index_h_start * kernel_stride_h;
1078  uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c;
1079 
1080  T out_temp = static_cast<T>(0);
1081  for(int index_h = index_h_start; index_h < index_h_end; ++index_h, in_ptr_row += input_stride_h, weights_ptr_row += kernel_stride_h)
1082  {
1083  const T *in_ptr_mover = in_ptr_row;
1084  int index_wc = index_wc_start;
1085  vector_type out_temp_vec = wrapper::vdup_n(static_cast<T>(0), tag_type());
1086  for(; index_wc <= index_wc_end - num_elems_read_per_iteration; index_wc += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration)
1087  {
1088  const auto src_vec = wrapper::vloadq(in_ptr_mover);
1089  const auto w_vec = wrapper::vloadq(weights_ptr_row + index_wc);
1090  out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec);
1091  }
1092  out_temp += vreduce(out_temp_vec);
1093  for(; index_wc < index_wc_end; ++index_wc, ++in_ptr_mover)
1094  {
1095  const auto src_val = *(in_ptr_mover);
1096  const auto w_val = *(weights_ptr_row + index_wc);
1097  out_temp += src_val * w_val;
1098  }
1099  }
1100  *(reinterpret_cast<T *>(out_ptr)) = out_temp;
1101  },
1102  wei);
1103  },
1104  out);
1105 }
1106 
1107 template <typename T>
1108 void CpuDirectConv2dKernel::convolve_nhwc(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst)
1109 {
1110  // Declare useful types
1111  using vtype = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
1112  using vector_type = typename vtype::type;
1113  using tag_type = typename vtype::tag_type;
1114 
1115  // Scalar quantities
1116  const int element_size = src->info()->element_size();
1117  const int input_stride_w = src->info()->strides_in_bytes().y() / element_size;
1118  const int input_stride_h = src->info()->strides_in_bytes().z() / element_size;
1119  const int input_stride_n = src->info()->strides_in_bytes()[3] / element_size;
1120  const int input_dim_w = src->info()->dimension(1);
1121  const int input_dim_h = src->info()->dimension(2);
1122 
1123  const int output_stride_c = dst->info()->strides_in_bytes().x();
1124 
1125  const unsigned int kernel_stride_w = weights->info()->strides_in_bytes().y() / element_size;
1126  const unsigned int kernel_stride_h = weights->info()->strides_in_bytes().z() / element_size;
1127  const int kernel_dim_w = weights->info()->dimension(1);
1128  const int kernel_dim_h = weights->info()->dimension(2);
1129 
1130  const int conv_pad_top = _conv_info.pad_top();
1131  const int conv_pad_left = _conv_info.pad_left();
1132  const int conv_stride_w = std::get<0>(_conv_info.stride());
1133  const int conv_stride_h = std::get<1>(_conv_info.stride());
1134 
1135  // Set up the window for the output iterator
1136  Window window_out = window;
1137  window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
1138 
1139  // Set up the window for the weights iterator
1140  Window window_w = calculate_max_window(*weights->info(), Steps());
1141  window_w.set(Window::DimX, Window::Dimension(0, 1, 1));
1142  window_w.set(Window::DimY, Window::Dimension(0, 1, 1));
1143  window_w.set(Window::DimZ, Window::Dimension(0, 1, 1));
1144 
1145  Iterator out(dst, window_out);
1146  Iterator wei(weights, window_w);
1147 
1148  constexpr int num_elems_read_per_iteration = 16 / sizeof(T);
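// 16 bytes per 128-bit Neon register: 4 floats (or 8 half-precision values) processed per vector iteration.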
1149 
1150  execute_window_loop(window_out, [&](const Coordinates & id)
1151  {
1152  // We compute the theoretical input starting points
1153  const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left;
1154  const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top;
1155  const int in_w_end_t = in_w_start_t + kernel_dim_w;
1156  const int in_h_end_t = in_h_start_t + kernel_dim_h;
1157 
1158  // We are computing the valid initial and ending input points by checking the borders
1159  const int in_w_start = std::max(in_w_start_t, 0);
1160  const int in_h_start = std::max(in_h_start_t, 0);
1161  const int in_w_end = std::min(in_w_end_t, input_dim_w);
1162  const int in_h_end = std::min(in_h_end_t, input_dim_h);
1163 
1164  // We use the input points to select the valid weight points to use
1165  const int wei_w_start = in_w_start - in_w_start_t;
1166  const int wei_h_start = in_h_start - in_h_start_t;
1167  const int wei_w_end = kernel_dim_w - (in_w_end_t - in_w_end);
1168  const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end);
1169 
1170  const int index_c_end = weights->info()->dimension(0);
1171  const T *const in_ptr_start = reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) + id[3] * input_stride_n;
1172 
1173  execute_window_loop(window_w, [&](const Coordinates & id_w)
1174  {
1175  const T *const weights_ptr_start = reinterpret_cast<const T *>(wei.ptr());
1176  uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c;
1177 
1178  T out_temp = static_cast<T>(0);
1179  for(int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; ++index_wei_h, ++index_in_h)
1180  {
1181  const T *const in_ptr_row = in_ptr_start + index_in_h * input_stride_h;
1182  const T *const weights_ptr_row = weights_ptr_start + index_wei_h * kernel_stride_h;
1183  for(int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end; ++index_wei_w, ++index_in_w)
1184  {
1185  const T *in_ptr_mover = in_ptr_row + index_in_w * input_stride_w;
1186  const T *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w;
1187  int index_c = 0;
1188  vector_type out_temp_vec = wrapper::vdup_n(static_cast<T>(0), tag_type());
1189  for(; index_c <= index_c_end - num_elems_read_per_iteration; index_c += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration, weights_ptr_mover += num_elems_read_per_iteration)
1190  {
1191  const auto src_vec = wrapper::vloadq(in_ptr_mover);
1192  const auto w_vec = wrapper::vloadq(weights_ptr_mover);
1193  out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec);
1194  }
1195  out_temp += vreduce(out_temp_vec);
1196  for(; index_c < index_c_end; ++index_c, ++in_ptr_mover, ++weights_ptr_mover)
1197  {
1198  const auto src_val = *(in_ptr_mover);
1199  const auto w_val = *(weights_ptr_mover);
1200  out_temp += src_val * w_val;
1201  }
1202  }
1203  }
1204  *(reinterpret_cast<T *>(out_ptr)) = out_temp;
1205  },
1206  wei);
1207  },
1208  out);
1209 }
1210 
1211 BorderSize CpuDirectConv2dKernel::border_size() const
1212 {
1213  return _border_size;
1214 }
1215 
1216 void CpuDirectConv2dKernel::configure(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info)
1217 {
1218  ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
1219 
1220  _conv_info = conv_info;
1221  _data_layout = src->data_layout();
1222  _kernel_size = weights->dimension(get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH));
1223 
1224  const unsigned int conv_pad_left = conv_info.pad_left();
1225  const unsigned int conv_pad_top = conv_info.pad_top();
1226  const unsigned int conv_pad_right = conv_info.pad_right();
1227  const unsigned int conv_pad_bottom = conv_info.pad_bottom();
1228  if(_data_layout == DataLayout::NCHW)
1229  {
1230  _border_size = BorderSize(conv_pad_top, conv_pad_right, conv_pad_bottom, conv_pad_left);
1231  }
1232  else
1233  {
1234  _border_size = BorderSize(0);
1235  }
1236 
1237  // Get convolved dimensions
1238  TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info);
1239 
1240  DataType data_type = src->data_type();
1241 
1242  // Output auto-initialization if not yet initialized
1243  auto_init_if_empty(*dst, output_shape, 1, data_type);
1244 
1245  // Perform validation step
1246  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, dst, conv_info));
1247 
1248  // Configure kernel window
1249  auto win_config = validate_and_configure_window(src, weights, dst, conv_info, _num_weight_elems_read_per_row,
1250  _num_elems_read_per_iteration, _num_elems_written_per_iteration, _border_size);
1251  ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
1252  ICpuKernel::configure(win_config.second);
1253 }
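The convolved shape requested from compute_deep_convolution_shape follows the usual direct-convolution arithmetic; under the default floor rounding it reduces to the sketch below (a hedged illustration, not the shape calculator itself; a different DimensionRoundingType changes the division):

// Spatial output extent for one dimension under floor rounding (assumed default).
// in: input extent, k: kernel extent, stride/pad_before/pad_after as in PadStrideInfo.
constexpr int conv_out_dim(int in, int k, int stride, int pad_before, int pad_after)
{
    return (in + pad_before + pad_after - k) / stride + 1;
}

// Example: a 7x7 input with a 3x3 kernel, stride 2 and symmetric padding of 1
// gives a 4x4 spatial output; the channel count comes from the number of kernels (OFM).
static_assert(conv_out_dim(7, 3, 2, 1, 1) == 4, "7 -> 4 with k=3, s=2, p=1");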
1254 
1255 Status CpuDirectConv2dKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info)
1256 {
1257  unsigned int num_weight_elems_read_per_row = 0;
1258  unsigned int num_elems_read_per_iteration = 0;
1259  unsigned int num_elems_written_per_iteration = 0;
1260  BorderSize border_size = {};
1261  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, dst, conv_info));
1262  ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(),
1263  weights->clone().get(),
1264  dst->clone().get(),
1265  conv_info,
1266  num_weight_elems_read_per_row,
1267  num_elems_read_per_iteration,
1268  num_elems_written_per_iteration,
1269  border_size)
1270  .first);
1271 
1272  return Status{};
1273 }
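As with the other Cpu kernels, the static validate() can be called before configure() to check a candidate configuration without allocating anything. A hedged usage sketch follows; the include path, the tensor shapes and the channel-first NHWC shape ordering are assumptions for illustration only:

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "src/cpu/kernels/CpuDirectConv2dKernel.h"

using namespace arm_compute;

bool direct_conv_config_is_valid()
{
    // Assumed 1x32x32x16 NHWC input; TensorShape is ordered (C, W, H, N) here.
    TensorInfo src(TensorShape(16U, 32U, 32U, 1U), 1, DataType::F32);
    TensorInfo wei(TensorShape(16U, 3U, 3U, 8U), 1, DataType::F32); // 8 filters of 3x3x16
    TensorInfo dst{};                                               // empty: inferred during validation
    src.set_data_layout(DataLayout::NHWC);
    wei.set_data_layout(DataLayout::NHWC);

    const PadStrideInfo conv_info(1, 1, 1, 1); // stride (1,1), symmetric padding of 1

    const Status status = cpu::kernels::CpuDirectConv2dKernel::validate(&src, &wei, &dst, conv_info);
    return status.error_code() == ErrorCode::OK;
}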
1274 
1275 void CpuDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
1276 {
1277  ARM_COMPUTE_UNUSED(info);
1278  ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
1279  ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
1280 
1281  auto src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
1282  auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
1283  auto dst = tensors.get_tensor(TensorType::ACL_DST);
1284  const int kernel_size = weights->info()->dimension(get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH));
1285 
1286  if(_data_layout == DataLayout::NCHW)
1287  {
1288  switch(kernel_size)
1289  {
1290  case 1:
1291  {
1292  switch(src->info()->data_type())
1293  {
1294  case DataType::F32:
1295  convolve_1x1<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, src, weights, dst, _conv_info);
1296  break;
1297 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
1298  case DataType::F16:
1299  convolve_1x1<float16_t, float16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, src, weights, dst, _conv_info);
1300  break;
1301 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1302  default:
1303  ARM_COMPUTE_ERROR("Data type not supported");
1304  break;
1305  }
1306  break;
1307  }
1308  case 3:
1309  {
1310  switch(src->info()->data_type())
1311  {
1312  case DataType::F32:
1313  convolve_3x3<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, src, weights, dst, _conv_info);
1314  break;
1315 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
1316  case DataType::F16:
1317  convolve_3x3<float16_t, float16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, src, weights, dst, _conv_info);
1318  break;
1319 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1320  default:
1321  ARM_COMPUTE_ERROR("Data type not supported");
1322  break;
1323  }
1324  break;
1325  }
1326  case 5:
1327  {
1328  switch(src->info()->data_type())
1329  {
1330  case DataType::F32:
1331  convolve_5x5<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, src, weights, dst, _conv_info);
1332  break;
1333  default:
1334  ARM_COMPUTE_ERROR("Data type not supported");
1335  break;
1336  }
1337  break;
1338  }
1339  default:
1340  {
1341  ARM_COMPUTE_ERROR("Only kernel sizes 1x1, 3x3 and 5x5 are supported.");
1342  break;
1343  }
1344  }
1345  }
1346  else
1347  {
1348  switch(src->info()->data_type())
1349  {
1350  case DataType::F32:
1351  {
1352  if(have_zero_x_internal_padding(src->info(), weights->info()))
1353  {
1354  convolve_nhwc_optimized<float>(window, src, weights, dst);
1355  }
1356  else
1357  {
1358  convolve_nhwc<float>(window, src, weights, dst);
1359  }
1360  break;
1361  }
1362  default:
1363  ARM_COMPUTE_ERROR("Data type not supported");
1364  break;
1365  }
1366  }
1367 }
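run_op() consumes its operands from an ITensorPack keyed by ACL_SRC_0 (input), ACL_SRC_1 (weights) and ACL_DST (output), as read at the top of the function. A minimal sketch of driving the kernel directly, assuming it has already been configured and that the include path and the single-threaded invocation are illustrative simplifications (a scheduler would normally split the window):

#include "arm_compute/core/CPP/CPPTypes.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/Types.h"
#include "src/cpu/kernels/CpuDirectConv2dKernel.h"

using namespace arm_compute;

// Assumes 'kernel' was configured with the infos of the three tensors and
// that src/weights/dst are allocated and filled.
void run_direct_conv_once(cpu::kernels::CpuDirectConv2dKernel &kernel,
                          const ITensor *src, const ITensor *weights, ITensor *dst)
{
    ITensorPack pack;
    pack.add_const_tensor(TensorType::ACL_SRC_0, src);     // input
    pack.add_const_tensor(TensorType::ACL_SRC_1, weights); // weights
    pack.add_tensor(TensorType::ACL_DST, dst);             // output

    // Execute the whole configured window on the calling thread.
    kernel.run_op(pack, kernel.window(), ThreadInfo{});
}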
1368 const char *CpuDirectConv2dKernel::name() const
1369 {
1370  return "CpuDirectConvolutionLayerKernel";
1371 }
1372 } // namespace kernels
1373 } // namespace cpu
1374 } // namespace arm_compute