// FP16 path (only compiled when the toolchain advertises
// __ARM_FEATURE_FP16_VECTOR_ARITHMETIC): strided loads of 8 half-precision
// values. Stride 1 loads contiguously; strides 2/3 use de-interleaving
// vld2q/vld3q and presumably return tmp.val[0] (the lane-0 plane) -- the
// returns/braces were lost in extraction, confirm against upstream.
// NOTE(review): the leading numbers fused into these lines are original
// source line numbers from a broken extraction, not code.
55 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 56 template <
unsigned int str
idex>
57 float16x8_t internal_vld1q(
const float16_t *in);
60 float16x8_t internal_vld1q<1>(
const float16_t *in)
66 float16x8_t internal_vld1q<2>(
const float16_t *in)
68 const float16x8x2_t tmp = vld2q_f16(in);
73 float16x8_t internal_vld1q<3>(
const float16_t *in)
75 const float16x8x3_t tmp = vld3q_f16(in);
/** Broadcast a single FP16 scalar across all 8 lanes of a float16x8_t.
 *
 * Reconstructed from the mangled extraction: the body (return vdupq_n_f16)
 * is visible in SOURCE, only the braces were lost.
 */
inline float16x8_t internal_vdupq_n(float16_t v)
{
    return vdupq_n_f16(v);
}
// FP16 store and multiply / multiply-accumulate helpers. Bodies are not
// visible in this chunk -- presumably vst1q_f16(p, v), vmulq_f16(x, y) and
// an x + y*z fused form; TODO confirm against upstream before compiling.
84 inline void internal_vst1q(float16_t *p,
const float16x8_t &v)
89 float16x8_t internal_vmull(
const float16x8_t &x,
const float16x8_t &y)
94 inline float16x8_t internal_vmlal(
const float16x8_t &x,
const float16x8_t &y,
const float16x8_t &z)
// Primary template: load four FP32 values starting at `in` with the given
// compile-time element stride. Only the <1>/<2>/<3> specializations below
// are defined; any other stride is a link error by design.
100 template <
unsigned int str
idex>
101 float32x4_t internal_vld1q(
const float *in);
/** Stride-1 specialization: 4 contiguous FP32 values straight into a vector.
 *
 * The `template <>` marker was lost in the extraction; restored here so the
 * explicit specialization is well-formed.
 */
template <>
float32x4_t internal_vld1q<1>(const float *in)
{
    return vld1q_f32(in);
}
/** Stride-2 specialization: de-interleaving load of 8 floats; the even
 * elements (every second value) land in tmp.val[0], which is the result.
 *
 * `template <>`, the return statement and braces were lost in the
 * extraction; reconstructed here.
 */
template <>
float32x4_t internal_vld1q<2>(const float *in)
{
    const float32x4x2_t tmp = vld2q_f32(in);
    return tmp.val[0];
}
/** Stride-3 specialization: de-interleaving load of 12 floats; every third
 * value lands in tmp.val[0], which is the result.
 *
 * `template <>`, the return statement and braces were lost in the
 * extraction; reconstructed here.
 */
template <>
float32x4_t internal_vld1q<3>(const float *in)
{
    const float32x4x3_t tmp = vld3q_f32(in);
    return tmp.val[0];
}
/** Broadcast a single FP32 scalar across all 4 lanes of a float32x4_t. */
inline float32x4_t internal_vdupq_n(float v)
{
    return vdupq_n_f32(v);
}
/** Store 4 FP32 lanes contiguously at p.
 *
 * Body was lost in the extraction; the only store matching the FP32 helper
 * family (cf. the vld1q/vmull/vmlal counterparts above) is vst1q_f32.
 */
inline void internal_vst1q(float *p, const float32x4_t &v)
{
    vst1q_f32(p, v);
}
/** Lane-wise multiply of two FP32 vectors.
 *
 * Marked inline for consistency with the sibling helpers (internal_vmlal,
 * internal_vdupq_n) -- this is a header-style definition and would otherwise
 * violate the ODR when included in several TUs.
 */
inline float32x4_t internal_vmull(const float32x4_t &x, const float32x4_t &y)
{
    return vmulq_f32(x, y);
}
/** Multiply-accumulate: returns x + y * z, lane-wise. */
inline float32x4_t internal_vmlal(const float32x4_t &x, const float32x4_t &y, const float32x4_t &z)
{
    return vmlaq_f32(x, y, z);
}
// Threshold used by run_optim_small_tensor_info() to decide whether a tensor
// is small enough to take the optimized small-tensor 1x1 convolution path
// (convolver_w1x1_i8x8_f32) instead of the generic convolver_1x1.
constexpr int small_tensor_size_optim = 8;
// Heuristic gate for the small-tensor optimized 1x1 path. Body is missing
// from this chunk -- presumably it compares the tensor's spatial dimensions
// against small_tensor_size_optim; TODO confirm against upstream.
144 inline bool run_optim_small_tensor_info(
const ITensorInfo *
t)
149 inline bool run_optim_small_tensor(
const ITensor *
t)
151 return run_optim_small_tensor_info(
t->info());
// Optimized 1x1 convolution for small tensors: accumulates 8 output rows at
// a time into two banks of 4-lane FP32 accumulators (accum0 = cols 0-3,
// accum1 = cols 4-7), iterating over kernel depth, then stores per row.
// NOTE(review): this chunk lost several lines of the original (window_k,
// input_ptr/offset_xy/ih/oh declarations, the execute_window_loop wrapper,
// braces); do not compile as-is -- restore from upstream.
157 template <
unsigned int str
idex>
158 class convolver_w1x1_i8x8_f32
161 static void convolve(
const Window &window,
const ITensor *
src,
const ITensor *weights, ITensor *
dst,
const PadStrideInfo &
conv_info)
166 const int input_stride_x =
src->info()->strides_in_bytes().x();
169 const int output_stride_y =
dst->info()->strides_in_bytes().y();
170 const int output_stride_z =
dst->info()->strides_in_bytes().z();
171 const int kernel_stride_z = weights->info()->strides_in_bytes().z();
172 const int kernel_stride_w = weights->info()->strides_in_bytes()[3];
173 const int output_h =
dst->info()->dimension(1);
174 const int range_z = window.z().end() - window.z().start();
175 const int kernel_depth = weights->info()->dimension(
Window::DimZ);
// Output window collapses DimZ so one invocation handles range_z planes.
181 Window window_out = window;
184 window_out.set(
Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), range_z));
// Input window is zeroed in x/y/z: input coordinates are computed manually.
187 Window window_in = window;
189 window_in.set(
Window::DimX, Window::Dimension(0, 0, 0));
190 window_in.set(
Window::DimY, Window::Dimension(0, 0, 0));
191 window_in.set(
Window::DimZ, Window::Dimension(0, 0, 0));
194 Iterator out(
dst, window_out);
195 Iterator in(
src, window_in);
196 Iterator k(weights, window_k);
198 const uint8_t *k_ptr = k.ptr();
203 uint8_t *out_ptr = out.ptr();
206 std::array<float32x4_t, 8> accum0 = { vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) };
207 std::array<float32x4_t, 8> accum1 = { vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0), vdupq_n_f32(0) };
208 for(
int oz = 0; oz < range_z; ++oz)
// Reset accumulators for each output plane.
210 accum0[0] = accum0[1] = accum0[2] = accum0[3] = accum0[4] = accum0[5] = accum0[6] = accum0[7] = vdupq_n_f32(0.f);
211 accum1[0] = accum1[1] = accum1[2] = accum1[3] = accum1[4] = accum1[5] = accum1[6] = accum1[7] = vdupq_n_f32(0.f);
212 auto p_out_base = out_ptr + oz * output_stride_z;
213 for(
int p = 0; p < kernel_depth; ++p)
// One 1x1 weight per input plane, broadcast across all lanes.
215 const auto k_val = reinterpret_cast<const float *>(k_ptr + p * kernel_stride_z + (
id.z() + oz) * kernel_stride_w);
216 const auto vk0 = internal_vdupq_n(*k_val);
217 for(ih = 0, oh = 0; oh < output_h; ++oh, ih +=
conv_stride_y)
220 auto in_val = reinterpret_cast<const float *>(input_ptr + p *
input_stride_z + offset_xy);
221 auto v_in0 = internal_vld1q<stridex>(in_val);
222 auto v_in1 = internal_vld1q<stridex>(in_val + 4);
223 accum0[oh] = vmlaq_f32(accum0[oh], vk0, v_in0);
224 accum1[oh] = vmlaq_f32(accum1[oh], vk0, v_in1);
// Flush the 8 accumulated rows to the destination plane.
227 for(oh = 0; oh < output_h; ++oh)
229 auto p_out = reinterpret_cast<float *>(p_out_base + oh * output_stride_y);
230 vst1q_f32(p_out, accum0[oh]);
231 vst1q_f32(p_out + 4, accum1[oh]);
// Generic 1x1 convolver (the enclosing class header is not visible in this
// chunk; from the dispatchers below this is convolver_1x1<T1, T2, stridex>).
// Plane p == 0 initializes the output with vmull; planes p >= 1 read the
// partial result back and accumulate with vmlal.
// NOTE(review): several original lines (window_k, input_ptr, offset_xy,
// ih/oh, conv_stride_y declarations, braces) were lost in extraction.
239 template <
typename T1,
typename T2,
unsigned int str
idex>
243 static void convolve(
const Window &window,
unsigned int num_elems_read_per_iteration,
unsigned int num_elems_written_per_iteration,
244 const ITensor *
src,
const ITensor *weights, ITensor *
dst,
const PadStrideInfo &
conv_info)
246 const int input_stride_x =
src->info()->strides_in_bytes().x();
249 const int output_stride_y =
dst->info()->strides_in_bytes().y();
250 const int output_stride_z =
dst->info()->strides_in_bytes().z();
251 const int kernel_stride_z = weights->info()->strides_in_bytes().z();
252 const int kernel_stride_w = weights->info()->strides_in_bytes()[3];
253 const int output_w =
dst->info()->dimension(0);
254 const int output_h =
dst->info()->dimension(1);
255 const int range_z = window.z().end() - window.z().start();
256 const int kernel_depth = weights->info()->dimension(
Window::DimZ);
262 Window window_out = window;
265 window_out.set(
Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), range_z));
268 Window window_in = window;
270 window_in.set(
Window::DimX, Window::Dimension(0, 0, 0));
271 window_in.set(
Window::DimY, Window::Dimension(0, 0, 0));
272 window_in.set(
Window::DimZ, Window::Dimension(0, 0, 0));
275 Iterator out(
dst, window_out);
276 Iterator in(
src, window_in);
277 Iterator k(weights, window_k);
279 const uint8_t *k_ptr = k.ptr();
287 uint8_t *out_ptr = out.ptr();
290 for(
int oz = 0; oz < range_z; ++oz)
292 auto p_out_base = out_ptr + oz * output_stride_z;
// First plane (p = 0): write, don't accumulate.
295 const auto k_val = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + (
id.z() + oz) * kernel_stride_w);
296 const auto vk = internal_vdupq_n(*k_val);
297 for(ih = 0, oh = 0; oh < output_h; ++oh, ih +=
conv_stride_y)
300 auto in_val = reinterpret_cast<const T1 *>(input_ptr + (0 *
input_stride_z + offset_xy));
301 auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
302 for(
int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, in_val += num_elems_read_per_iteration, p_out += num_elems_written_per_iteration)
304 internal_vst1q(p_out, internal_vmull(vk, internal_vld1q<stridex>(in_val)));
// Remaining planes: read back partial output and accumulate.
310 for(
int p = 1; p < kernel_depth; ++p)
312 const auto k_val = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + (
id.z() + oz) * kernel_stride_w);
313 const auto vk = internal_vdupq_n(*k_val);
314 for(ih = 0, oh = 0; oh < output_h; ++oh, ih +=
conv_stride_y)
317 auto in_val = reinterpret_cast<const T1 *>(input_ptr + p *
input_stride_z + offset_xy);
318 auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
319 for(
int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, in_val += num_elems_read_per_iteration, p_out += num_elems_written_per_iteration)
321 internal_vst1q(p_out, internal_vmlal(internal_vld1q<1>(p_out), vk, internal_vld1q<stridex>(in_val)));
// Primary template: 5x5 row convolution of five input rows against five
// 5-tap weight rows, producing up to 8 FP32 results (two vectors). Only the
// <1>/<2>/<3> stride specializations below are defined.
331 template <
unsigned int str
idex>
332 float32x4x2_t convolve_5x5(
const float *in_0,
const float *in_1,
const float *in_2,
const float *in_3,
const float *in_4,
333 const float *m0,
const float *m1,
const float *m2,
const float *m3,
const float *m4);
// Broadcast the first three taps of a 5-tap weight row into three vectors.
// The initializer body (presumably three vld1q_dup_f32/vdupq_n_f32 loads)
// was lost in extraction -- confirm against upstream.
335 inline float32x4x3_t load_matrix_hi(
const float *
const m0,
const float *
const m1,
const float *
const m2)
337 const float32x4x3_t m00 =
// Broadcast the last two taps of a 5-tap weight row into two vectors.
// Initializer body missing from this chunk -- confirm against upstream.
348 inline float32x4x2_t load_matrix_lo(
const float *
const m3,
const float *
const m4)
350 const float32x4x2_t m00 =
// Load 12 contiguous FP32 input values as three 4-lane vectors (enough for
// 8 outputs of a 5-tap filter). Initializer body missing from this chunk.
360 inline float32x4x3_t load_input(
const float *
const in)
362 const float32x4x3_t vin =
// Stride-1 5x5 convolution: 8 outputs as {out.val[0], out.val[1]}. Each of
// the five input rows is slid across the 5 taps with vextq_f32 and folded
// into the accumulators with vmlaq_f32. NOTE(review): extraction lost the
// `template <>` marker, the float32x4x2_t out = {{...}} wrapper around the
// two vmulq initializer lines, the val[1] initializers, and `return out;`.
374 inline float32x4x2_t convolve_5x5<1>(
const float *in_0,
const float *in_1,
const float *in_2,
const float *in_3,
const float *in_4,
375 const float *m0,
const float *m1,
const float *m2,
const float *m3,
const float *m4)
377 const float32x4x3_t vin0 = load_input(in_0);
378 const float32x4x3_t vin1 = load_input(in_1);
379 const float32x4x3_t vin2 = load_input(in_2);
380 const float32x4x3_t vin3 = load_input(in_3);
381 const float32x4x3_t vin4 = load_input(in_4);
382 const float32x4x3_t m00 = load_matrix_hi(m0, 1 + m0, 2 + m0);
383 const float32x4x2_t m01 = load_matrix_lo(3 + m0, 4 + m0);
384 const float32x4x3_t m10 = load_matrix_hi(m1, 1 + m1, 2 + m1);
385 const float32x4x2_t m11 = load_matrix_lo(3 + m1, 4 + m1);
386 const float32x4x3_t m20 = load_matrix_hi(m2, 1 + m2, 2 + m2);
387 const float32x4x2_t m21 = load_matrix_lo(3 + m2, 4 + m2);
388 const float32x4x3_t m30 = load_matrix_hi(m3, 1 + m3, 2 + m3);
389 const float32x4x2_t m31 = load_matrix_lo(3 + m3, 4 + m3);
390 const float32x4x3_t m40 = load_matrix_hi(m4, 1 + m4, 2 + m4);
391 const float32x4x2_t m41 = load_matrix_lo(3 + m4, 4 + m4);
// Initialize both halves of out with the tap-0 product of row 0.
396 vmulq_f32(vin0.val[0], m00.val[0]),
397 vmulq_f32(vin0.val[1], m00.val[0])
// Row 0, taps 1-4 into the low 4 outputs.
401 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin0.val[0], vin0.val[1], 1), m00.val[1]);
402 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin0.val[0], vin0.val[1], 2), m00.val[2]);
403 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin0.val[0], vin0.val[1], 3), m01.val[0]);
404 out.val[0] = vmlaq_f32(out.val[0], vin0.val[1], m01.val[1]);
406 out.val[0] = vmlaq_f32(out.val[0], vin1.val[0], m10.val[0]);
407 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin1.val[0], vin1.val[1], 1), m10.val[1]);
408 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin1.val[0], vin1.val[1], 2), m10.val[2]);
409 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin1.val[0], vin1.val[1], 3), m11.val[0]);
410 out.val[0] = vmlaq_f32(out.val[0], vin1.val[1], m11.val[1]);
412 out.val[0] = vmlaq_f32(out.val[0], vin2.val[0], m20.val[0]);
413 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin2.val[0], vin2.val[1], 1), m20.val[1]);
414 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin2.val[0], vin2.val[1], 2), m20.val[2]);
415 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin2.val[0], vin2.val[1], 3), m21.val[0]);
416 out.val[0] = vmlaq_f32(out.val[0], vin2.val[1], m21.val[1]);
418 out.val[0] = vmlaq_f32(out.val[0], vin3.val[0], m30.val[0]);
419 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin3.val[0], vin3.val[1], 1), m30.val[1]);
420 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin3.val[0], vin3.val[1], 2), m30.val[2]);
421 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin3.val[0], vin3.val[1], 3), m31.val[0]);
422 out.val[0] = vmlaq_f32(out.val[0], vin3.val[1], m31.val[1]);
424 out.val[0] = vmlaq_f32(out.val[0], vin4.val[0], m40.val[0]);
425 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin4.val[0], vin4.val[1], 1), m40.val[1]);
426 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin4.val[0], vin4.val[1], 2), m40.val[2]);
427 out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vin4.val[0], vin4.val[1], 3), m41.val[0]);
428 out.val[0] = vmlaq_f32(out.val[0], vin4.val[1], m41.val[1]);
// Same pattern shifted by 4 columns into the high 4 outputs.
430 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin0.val[1], vin0.val[2], 1), m00.val[1]);
431 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin0.val[1], vin0.val[2], 2), m00.val[2]);
432 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin0.val[1], vin0.val[2], 3), m01.val[0]);
433 out.val[1] = vmlaq_f32(out.val[1], vin0.val[2], m01.val[1]);
435 out.val[1] = vmlaq_f32(out.val[1], vin1.val[1], m10.val[0]);
436 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin1.val[1], vin1.val[2], 1), m10.val[1]);
437 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin1.val[1], vin1.val[2], 2), m10.val[2]);
438 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin1.val[1], vin1.val[2], 3), m11.val[0]);
439 out.val[1] = vmlaq_f32(out.val[1], vin1.val[2], m11.val[1]);
441 out.val[1] = vmlaq_f32(out.val[1], vin2.val[1], m20.val[0]);
442 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin2.val[1], vin2.val[2], 1), m20.val[1]);
443 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin2.val[1], vin2.val[2], 2), m20.val[2]);
444 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin2.val[1], vin2.val[2], 3), m21.val[0]);
445 out.val[1] = vmlaq_f32(out.val[1], vin2.val[2], m21.val[1]);
447 out.val[1] = vmlaq_f32(out.val[1], vin3.val[1], m30.val[0]);
448 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin3.val[1], vin3.val[2], 1), m30.val[1]);
449 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin3.val[1], vin3.val[2], 2), m30.val[2]);
450 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin3.val[1], vin3.val[2], 3), m31.val[0]);
451 out.val[1] = vmlaq_f32(out.val[1], vin3.val[2], m31.val[1]);
453 out.val[1] = vmlaq_f32(out.val[1], vin4.val[1], m40.val[0]);
454 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin4.val[1], vin4.val[2], 1), m40.val[1]);
455 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin4.val[1], vin4.val[2], 2), m40.val[2]);
456 out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vin4.val[1], vin4.val[2], 3), m41.val[0]);
457 out.val[1] = vmlaq_f32(out.val[1], vin4.val[2], m41.val[1]);
/** Stride-2 5x5 convolution: compute the stride-1 result, then compact every
 * second output (positions 0, 2, 4, 6 of the 8 results) into out.val[0].
 *
 * Reconstructed from the mangled extraction: `template <>` and `return out;`
 * were lost; the three lane-compaction lines are verbatim from SOURCE.
 */
template <>
inline float32x4x2_t convolve_5x5<2>(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4,
                                     const float *m0, const float *m1, const float *m2, const float *m3, const float *m4)
{
    float32x4x2_t out = convolve_5x5<1>(in_0, in_1, in_2, in_3, in_4, m0, m1, m2, m3, m4);
    out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1);
    out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2);
    out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3);
    return out;
}
/** Stride-3 5x5 convolution: compute the stride-1 result, then compact every
 * third output (positions 0, 3, 6 of the 8 results) into out.val[0] lanes
 * 0, 1, 2.
 *
 * Reconstructed from the mangled extraction: `template <>`, the second
 * compaction line (position 6 lives in out.val[1] lane 2) and `return out;`
 * were lost -- verify against upstream.
 */
template <>
inline float32x4x2_t convolve_5x5<3>(const float *in_0, const float *in_1, const float *in_2, const float *in_3, const float *in_4,
                                     const float *m0, const float *m1, const float *m2, const float *m3, const float *m4)
{
    float32x4x2_t out = convolve_5x5<1>(in_0, in_1, in_2, in_3, in_4, m0, m1, m2, m3, m4);
    out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
    out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 2);
    return out;
}
// 3x3 convolver (enclosing class header not visible; from the dispatchers
// below this is convolver_3x3<T1, T2, stridex>). Plane 0 writes
// (convolve_3x3<false>), planes >= 1 accumulate (convolve_3x3<true>).
// NOTE(review): extraction lost window_k, input_base, vk_r0/vk_r1/vk_r2,
// in_top/in_mid/in_low (first loop), delta_input, ih/oh, conv_stride_y,
// input_stride_y declarations and braces.
482 template <
typename T1,
typename T2,
unsigned int str
idex>
486 static void convolve(
const Window &window,
unsigned int num_elems_read_per_iteration,
unsigned int num_elems_written_per_iteration,
487 const ITensor *
src,
const ITensor *weights, ITensor *
dst,
const PadStrideInfo &
conv_info)
490 const int input_stride_x =
src->info()->strides_in_bytes().x();
493 const int output_stride_y =
dst->info()->strides_in_bytes().y();
494 const int output_stride_z =
dst->info()->strides_in_bytes().z();
495 const int kernel_stride_x = weights->info()->strides_in_bytes().x();
496 const int kernel_stride_y = weights->info()->strides_in_bytes().y();
497 const int kernel_stride_z = weights->info()->strides_in_bytes().z();
498 const int kernel_stride_w = weights->info()->strides_in_bytes()[3];
499 const int output_w =
dst->info()->dimension(0);
500 const int output_h =
dst->info()->dimension(1);
501 const int num_planes_z = window.z().end() - window.z().start();
503 const int kernel_depth = weights->info()->dimension(
Window::DimZ);
509 Window window_out = window;
512 window_out.set(
Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), num_planes_z));
515 Window window_in = window;
517 window_in.set(
Window::DimX, Window::Dimension(0, 0, 0));
518 window_in.set(
Window::DimY, Window::Dimension(0, 0, 0));
519 window_in.set(
Window::DimZ, Window::Dimension(0, 0, 0));
523 Iterator out(
dst, window_out);
524 Iterator in(
src, window_in);
525 Iterator k(weights, window_k);
527 const uint8_t *k_ptr = k.ptr();
532 uint8_t *out_ptr = out.ptr();
548 for(
int oz = 0; oz < num_planes_z; ++oz)
550 const int zoffset =
id.z() + oz;
551 uint8_t *p_out_base = out_ptr + oz * output_stride_z;
// First plane: three weight-row pointers at p = 0, write-only pass.
554 const auto ptr_k_r0 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x);
555 const auto ptr_k_r1 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x);
556 const auto ptr_k_r2 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x);
560 for(ih = 0, oh = 0; oh < output_h; ++oh, ih +=
conv_stride_y)
565 auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
566 for(
int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
567 in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration)
569 convolve_3x3<false>(in_top, in_mid, in_low, p_out, vk_r0, vk_r1, vk_r2, stridex);
// Remaining planes: accumulate into the partial output.
574 for(
int p = 1; p < kernel_depth; ++p)
576 const uint8_t *ptr_k_base = k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w;
578 const auto ptr_k_r0 = reinterpret_cast<const T1 *>(ptr_k_base);
579 const auto ptr_k_r1 = reinterpret_cast<const T1 *>(ptr_k_base + kernel_stride_y);
580 const auto ptr_k_r2 = reinterpret_cast<const T1 *>(ptr_k_base + kernel_stride_y * 2);
584 for(ih = 0, oh = 0; oh < output_h; ++oh, ih +=
conv_stride_y)
586 auto in_top = reinterpret_cast<const T1 *>(input_base + (ih + 0) *
input_stride_y);
587 auto in_mid = reinterpret_cast<const T1 *>(input_base + (ih + 1) *
input_stride_y);
588 auto in_low = reinterpret_cast<const T1 *>(input_base + (ih + 2) *
input_stride_y);
589 auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
590 for(
int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
591 in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration)
593 convolve_3x3<true>(in_top, in_mid, in_low, p_out, vk_r0, vk_r1, vk_r2, stridex);
// 5x5 convolver (enclosing class header not visible; from the dispatchers
// below this is convolver_5x5<T1, T2, stridex>). Plane 0 stores
// (store_results), planes >= 1 accumulate (accumulate_results).
// NOTE(review): extraction lost window_k, input_base, in_0..in_4,
// delta_input, ih/oh, conv_stride_y declarations and braces.
603 template <
typename T1,
typename T2,
unsigned int str
idex>
607 static void convolve(
const Window &window,
unsigned int num_elems_read_per_iteration,
unsigned int num_elems_written_per_iteration,
608 const ITensor *
src,
const ITensor *weights, ITensor *
dst,
const PadStrideInfo &
conv_info)
611 const int input_stride_x =
src->info()->strides_in_bytes().x();
614 const int output_stride_y =
dst->info()->strides_in_bytes().y();
615 const int output_stride_z =
dst->info()->strides_in_bytes().z();
616 const int kernel_stride_x = weights->info()->strides_in_bytes().x();
617 const int kernel_stride_y = weights->info()->strides_in_bytes().y();
618 const int kernel_stride_z = weights->info()->strides_in_bytes().z();
619 const int kernel_stride_w = weights->info()->strides_in_bytes()[3];
620 const int output_w =
dst->info()->dimension(0);
621 const int output_h =
dst->info()->dimension(1);
622 const int num_planes_z = window.z().end() - window.z().start();
624 const int kernel_depth = weights->info()->dimension(
Window::DimZ);
630 Window window_out = window;
633 window_out.set(
Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), num_planes_z));
636 Window window_in = window;
638 window_in.set(
Window::DimX, Window::Dimension(0, 0, 0));
639 window_in.set(
Window::DimY, Window::Dimension(0, 0, 0));
640 window_in.set(
Window::DimZ, Window::Dimension(0, 0, 0));
644 Iterator out(
dst, window_out);
645 Iterator in(
src, window_in);
646 Iterator k(weights, window_k);
648 const uint8_t *k_ptr = k.ptr();
653 uint8_t *out_ptr = out.ptr();
656 for(
int oz = 0; oz < num_planes_z; ++oz)
658 const int zoffset =
id.z() + oz;
659 uint8_t *p_out_base = out_ptr + oz * output_stride_z;
// First plane: five weight-row pointers at p = 0, write-only pass.
662 const auto ptr_k_r0 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x);
663 const auto ptr_k_r1 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x);
664 const auto ptr_k_r2 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x);
665 const auto ptr_k_r3 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 3 * kernel_stride_y + 0 * kernel_stride_x);
666 const auto ptr_k_r4 = reinterpret_cast<const T1 *>(k_ptr + 0 * kernel_stride_z + zoffset * kernel_stride_w + 4 * kernel_stride_y + 0 * kernel_stride_x);
667 for(ih = 0, oh = 0; oh < output_h; ++oh, ih +=
conv_stride_y)
674 auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
675 for(
int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
676 in_0 += delta_input, in_1 += delta_input, in_2 += delta_input, in_3 += delta_input, in_4 += delta_input, p_out += num_elems_written_per_iteration)
678 auto vres = convolve_5x5<stridex>(in_0, in_1, in_2, in_3, in_4, ptr_k_r0, ptr_k_r1, ptr_k_r2, ptr_k_r3, ptr_k_r4);
679 store_results<stridex>(p_out, vres);
// Remaining planes: accumulate into the partial output.
684 for(
int p = 1; p < kernel_depth; ++p)
686 const auto ptr_k_r0 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x);
687 const auto ptr_k_r1 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x);
688 const auto ptr_k_r2 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x);
689 const auto ptr_k_r3 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 3 * kernel_stride_y + 0 * kernel_stride_x);
690 const auto ptr_k_r4 = reinterpret_cast<const T1 *>(k_ptr + p * kernel_stride_z + zoffset * kernel_stride_w + 4 * kernel_stride_y + 0 * kernel_stride_x);
692 for(ih = 0, oh = 0; oh < output_h; ++oh, ih +=
conv_stride_y)
699 auto p_out = reinterpret_cast<T2 *>(p_out_base + oh * output_stride_y);
700 for(
int ow = 0; ow < output_w; ow += num_elems_written_per_iteration,
701 in_0 += delta_input, in_1 += delta_input, in_2 += delta_input, in_3 += delta_input, in_4 += delta_input, p_out += num_elems_written_per_iteration)
703 auto vres = convolve_5x5<stridex>(in_0, in_1, in_2, in_3, in_4, ptr_k_r0, ptr_k_r1, ptr_k_r2, ptr_k_r3, ptr_k_r4);
704 accumulate_results<stridex>(p_out, vres);
// Horizontal reduction of a float32x4_t to a scalar (used by the NHWC
// kernels below to fold a vector accumulator into out_temp). Body missing
// from this chunk -- presumably a pairwise-add lane sum; confirm upstream.
714 float vreduce(
const float32x4_t &v)
// Dispatch the generic 1x1 convolver on the runtime convolution stride.
// NOTE(review): the switch(conv_stride_x)/case scaffolding between the
// three instantiations was lost in extraction.
725 template <
typename T1,
typename T2>
726 inline void convolve_1x1(
const Window &window,
unsigned int num_elems_read_per_iteration,
unsigned int num_elems_written_per_iteration,
727 const ITensor *
src,
const ITensor *weights, ITensor *
dst,
const PadStrideInfo &
conv_info)
733 convolver_1x1<T1, T2, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration,
src, weights,
dst,
conv_info);
736 convolver_1x1<T1, T2, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration,
src, weights,
dst,
conv_info);
739 convolver_1x1<T1, T2, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration,
src, weights,
dst,
conv_info);
// FP32 specialization: small tensors take the optimized
// convolver_w1x1_i8x8_f32 path, everything else falls back to the generic
// convolver_1x1. NOTE(review): `template <>`, the else branch and the
// switch(conv_stride_x)/case scaffolding were lost in extraction.
747 inline void convolve_1x1<float, float>(
const Window &window,
unsigned int num_elems_read_per_iteration,
unsigned int num_elems_written_per_iteration,
748 const ITensor *
src,
const ITensor *weights, ITensor *
dst,
const PadStrideInfo &
conv_info)
751 if(run_optim_small_tensor(
src))
756 convolver_w1x1_i8x8_f32<1>::convolve(window,
src, weights,
dst,
conv_info);
759 convolver_w1x1_i8x8_f32<2>::convolve(window,
src, weights,
dst,
conv_info);
762 convolver_w1x1_i8x8_f32<3>::convolve(window,
src, weights,
dst,
conv_info);
773 convolver_1x1<float, float, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration,
src, weights,
dst,
conv_info);
776 convolver_1x1<float, float, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration,
src, weights,
dst,
conv_info);
779 convolver_1x1<float, float, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration,
src, weights,
dst,
conv_info);
// Dispatch the 3x3 convolver on the runtime convolution stride
// (switch/case scaffolding lost in extraction).
787 template <
typename T1,
typename T2>
788 inline void convolve_3x3(
const Window &window,
unsigned int num_elems_read_per_iteration,
unsigned int num_elems_written_per_iteration,
789 const ITensor *
src,
const ITensor *weights, ITensor *
dst,
const PadStrideInfo &
conv_info)
795 convolver_3x3<T1, T2, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration,
src, weights,
dst,
conv_info);
798 convolver_3x3<T1, T2, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration,
src, weights,
dst,
conv_info);
801 convolver_3x3<T1, T2, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration,
src, weights,
dst,
conv_info);
// Dispatch the 5x5 convolver on the runtime convolution stride
// (switch/case scaffolding lost in extraction).
808 template <
typename T1,
typename T2>
809 inline void convolve_5x5(
const Window &window,
unsigned int num_elems_read_per_iteration,
unsigned int num_elems_written_per_iteration,
810 const ITensor *
src,
const ITensor *weights, ITensor *
dst,
const PadStrideInfo &
conv_info)
816 convolver_5x5<T1, T2, 1>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration,
src, weights,
dst,
conv_info);
819 convolver_5x5<T1, T2, 2>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration,
src, weights,
dst,
conv_info);
822 convolver_5x5<T1, T2, 3>::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration,
src, weights,
dst,
conv_info);
// Fragment of a validation routine whose beginning lies outside this chunk:
// extra checks run only when dst is already configured (non-zero size).
850 if(
dst->total_size() != 0)
// Choose per-iteration element counts by data type / kernel size, then build
// the execution window and border requirements for the NCHW kernels.
// Outputs are returned through the reference parameters. NOTE(review):
// extraction lost the case labels, #else/#endif branches, the window/input
// access construction and the err/win declarations.
863 std::pair<Status, Window> validate_and_configure_window(ITensorInfo *
src, ITensorInfo *weights, ITensorInfo *
dst,
const PadStrideInfo &
conv_info,
unsigned int &num_weight_elems_read_per_row,
864 unsigned int &num_elems_read_per_iteration,
unsigned int &num_elems_written_per_iteration, BorderSize &border_size)
872 unsigned int kernel_size = weights->dimension(width_idx);
878 bool window_changed =
false;
// kernel_size == 1 path: element counts depend on type and tensor size.
886 switch(
src->data_type())
888 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 890 num_elems_written_per_iteration = 8;
894 if(run_optim_small_tensor_info(
src))
896 num_elems_written_per_iteration = 8;
900 num_elems_written_per_iteration = 4;
907 num_weight_elems_read_per_row = kernel_size;
908 num_elems_read_per_iteration =
conv_stride_x * num_elems_written_per_iteration;
// kernel_size == 3 path.
912 switch(
src->data_type())
915 num_weight_elems_read_per_row = 4 + kernel_size - 1;
916 num_elems_read_per_iteration = 12;
919 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 921 num_weight_elems_read_per_row = 8 + kernel_size - 1;
922 num_elems_read_per_iteration = 24;
// kernel_size == 5 path.
933 switch(
src->data_type())
936 num_weight_elems_read_per_row = 4 + kernel_size - 1;
937 num_elems_read_per_iteration = 12;
// Border: left padding from conv_info, right/bottom derived from bounds.
954 int start_x = kernel_size / 2 - static_cast<int>(
conv_info.pad_left());
961 const unsigned int conv_pad_right = std::max(upper_bound_w, 0);
962 const unsigned int conv_pad_bottom =
conv_info.pad_bottom();
966 border_size.right = conv_pad_right;
967 border_size.bottom = conv_pad_bottom;
973 num_elems_read_per_iteration, kernel_size,
975 AccessWindowStatic weights_access(weights, 0, 0, num_weight_elems_read_per_row, kernel_size);
976 AccessWindowHorizontal output_access(
dst, 0, num_elems_written_per_iteration);
978 output_access.set_valid_region(win, ValidRegion(Coordinates(),
dst->tensor_shape()));
987 return std::make_pair(err, win);
990 bool have_zero_x_internal_padding(ITensorInfo *
src, ITensorInfo *weights)
992 return (
src->padding().left == 0 && weights->padding().left == 0 &&
src->padding().right == 0 && weights->padding().right == 0);
// NHWC direct convolution, optimized variant: because x-padding is zero
// (see have_zero_x_internal_padding) width and channel collapse into a
// single contiguous `wc` axis, so the inner loop is one long vectorized
// dot product plus a scalar tail. NOTE(review): extraction lost the
// execute_window_loop wrappers, window_w, conv_pad_left/conv_pad_top,
// vector_type alias, id/id_w iterator coordinates, the vector FMA body of
// the inner loop, and braces.
997 template <
typename T>
998 void CpuDirectConvolutionKernel::convolve_nhwc_optimized(
const Window &window,
const ITensor *
src,
const ITensor *weights, ITensor *
dst)
1003 using vtype = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
1005 using tag_type =
typename vtype::tag_type;
// All strides converted from bytes to elements (except output_stride_c).
1008 const int element_size =
src->info()->element_size();
1009 const int input_stride_w =
src->info()->strides_in_bytes().y() / element_size;
1010 const int input_stride_h =
src->info()->strides_in_bytes().z() / element_size;
1011 const int input_stride_n =
src->info()->strides_in_bytes()[3] / element_size;
1012 const int input_dim_w =
src->info()->dimension(1);
1013 const int input_dim_h =
src->info()->dimension(2);
1015 const int output_stride_c =
dst->info()->strides_in_bytes().x();
1017 const unsigned int kernel_stride_w = weights->info()->strides_in_bytes().y() / element_size;
1018 const unsigned int kernel_stride_h = weights->info()->strides_in_bytes().z() / element_size;
1019 const int kernel_dim_w = weights->info()->dimension(1);
1020 const int kernel_dim_h = weights->info()->dimension(2);
1024 const int conv_stride_w = std::get<0>(_conv_info.stride());
1025 const int conv_stride_h = std::get<1>(_conv_info.stride());
1028 Window window_out = window;
1029 window_out.set(
Window::DimX, Window::Dimension(0, 1, 1));
1034 window_w.set(
Window::DimY, Window::Dimension(0, 1, 1));
1035 window_w.set(
Window::DimZ, Window::Dimension(0, 1, 1));
1037 Iterator out(
dst, window_out);
1038 Iterator wei(weights, window_w);
1040 constexpr
int num_elems_read_per_iteration = 16 /
sizeof(T);
// Receptive field of this output pixel, then clamped to the input bounds.
1062 const int in_w_start_t = static_cast<int>(
id.y()) * conv_stride_w -
conv_pad_left;
1063 const int in_h_start_t = static_cast<int>(
id.z()) * conv_stride_h -
conv_pad_top;
1064 const int in_w_end_t = in_w_start_t + kernel_dim_w;
1065 const int in_h_end_t = in_h_start_t + kernel_dim_h;
1068 const int in_w_start = std::max(in_w_start_t, 0);
1069 const int in_h_start = std::max(in_h_start_t, 0);
1070 const int in_w_end = std::min(in_w_end_t, input_dim_w);
1071 const int in_h_end = std::min(in_h_end_t, input_dim_h);
// Matching sub-range of the kernel (wc = fused width*channel axis).
1074 const int index_wc_start = (in_w_start - in_w_start_t) * kernel_stride_w;
1075 const int index_h_start = in_h_start - in_h_start_t;
1076 const int index_wc_end = (kernel_dim_w - (in_w_end_t - in_w_end)) * kernel_stride_w;
1077 const int index_h_end = kernel_dim_h - (in_h_end_t - in_h_end);
1086 const T *in_ptr_row = reinterpret_cast<const T *>(
src->buffer() +
src->info()->offset_first_element_in_bytes())
1087 +
id[3] * input_stride_n + in_w_start * input_stride_w + in_h_start * input_stride_h;
1088 const T *weights_ptr_row = reinterpret_cast<const T *>(wei.ptr()) + index_h_start * kernel_stride_h;
1089 uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c;
1091 T out_temp = static_cast<T>(0);
1092 for(
int index_h = index_h_start; index_h < index_h_end; ++index_h, in_ptr_row += input_stride_h, weights_ptr_row += kernel_stride_h)
1094 const T *in_ptr_mover = in_ptr_row;
1095 int index_wc = index_wc_start;
1096 vector_type out_temp_vec =
wrapper::vdup_n(static_cast<T>(0), tag_type());
// Vectorized body over the fused wc axis (FMA body lost in extraction).
1097 for(; index_wc <= index_wc_end - num_elems_read_per_iteration; index_wc += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration)
1103 out_temp += vreduce(out_temp_vec);
// Scalar tail for the leftover elements.
1104 for(; index_wc < index_wc_end; ++index_wc, ++in_ptr_mover)
1106 const auto src_val = *(in_ptr_mover);
1107 const auto w_val = *(weights_ptr_row + index_wc);
1108 out_temp += src_val * w_val;
1111 *(reinterpret_cast<T *>(out_ptr)) = out_temp;
1118 template <
typename T>
1119 void CpuDirectConvolutionKernel::convolve_nhwc(
const Window &window,
const ITensor *
src,
const ITensor *weights, ITensor *
dst)
1122 using vtype = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
1124 using tag_type =
typename vtype::tag_type;
1127 const int element_size =
src->info()->element_size();
1128 const int input_stride_w =
src->info()->strides_in_bytes().y() / element_size;
1129 const int input_stride_h =
src->info()->strides_in_bytes().z() / element_size;
1130 const int input_stride_n =
src->info()->strides_in_bytes()[3] / element_size;
1131 const int input_dim_w =
src->info()->dimension(1);
1132 const int input_dim_h =
src->info()->dimension(2);
1134 const int output_stride_c =
dst->info()->strides_in_bytes().x();
1136 const unsigned int kernel_stride_w = weights->info()->strides_in_bytes().y() / element_size;
1137 const unsigned int kernel_stride_h = weights->info()->strides_in_bytes().z() / element_size;
1138 const int kernel_dim_w = weights->info()->dimension(1);
1139 const int kernel_dim_h = weights->info()->dimension(2);
1143 const int conv_stride_w = std::get<0>(_conv_info.stride());
1144 const int conv_stride_h = std::get<1>(_conv_info.stride());
1147 Window window_out = window;
1148 window_out.set(
Window::DimX, Window::Dimension(0, 1, 1));
1153 window_w.set(
Window::DimY, Window::Dimension(0, 1, 1));
1154 window_w.set(
Window::DimZ, Window::Dimension(0, 1, 1));
1156 Iterator out(
dst, window_out);
1157 Iterator wei(weights, window_w);
1159 constexpr
int num_elems_read_per_iteration = 16 /
sizeof(T);
1164 const int in_w_start_t = static_cast<int>(
id.y()) * conv_stride_w -
conv_pad_left;
1165 const int in_h_start_t = static_cast<int>(
id.z()) * conv_stride_h -
conv_pad_top;
1166 const int in_w_end_t = in_w_start_t + kernel_dim_w;
1167 const int in_h_end_t = in_h_start_t + kernel_dim_h;
1170 const int in_w_start = std::max(in_w_start_t, 0);
1171 const int in_h_start = std::max(in_h_start_t, 0);
1172 const int in_w_end = std::min(in_w_end_t, input_dim_w);
1173 const int in_h_end = std::min(in_h_end_t, input_dim_h);
1176 const int wei_w_start = in_w_start - in_w_start_t;
1177 const int wei_h_start = in_h_start - in_h_start_t;
1178 const int wei_w_end = kernel_dim_w - (in_w_end_t - in_w_end);
1179 const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end);
1181 const int index_c_end = weights->info()->dimension(0);
1182 const T *
const in_ptr_start = reinterpret_cast<const T *>(
src->buffer() +
src->info()->offset_first_element_in_bytes()) +
id[3] * input_stride_n;
1186 const T *
const weights_ptr_start = reinterpret_cast<const T *>(wei.ptr());
1187 uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c;
1189 T out_temp = static_cast<T>(0);
1190 for(
int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; ++index_wei_h, ++index_in_h)
1192 const T *
const in_ptr_row = in_ptr_start + index_in_h * input_stride_h;
1193 const T *
const weights_ptr_row = weights_ptr_start + index_wei_h * kernel_stride_h;
1194 for(
int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end; ++index_wei_w, ++index_in_w)
1196 const T *in_ptr_mover = in_ptr_row + index_in_w * input_stride_w;
1197 const T *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w;
1199 vector_type out_temp_vec =
wrapper::vdup_n(static_cast<T>(0), tag_type());
1200 for(; index_c <= index_c_end - num_elems_read_per_iteration; index_c += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration, weights_ptr_mover += num_elems_read_per_iteration)
1206 out_temp += vreduce(out_temp_vec);
1207 for(; index_c < index_c_end; ++index_c, ++in_ptr_mover, ++weights_ptr_mover)
1209 const auto src_val = *(in_ptr_mover);
1210 const auto w_val = *(weights_ptr_mover);
1211 out_temp += src_val * w_val;
1215 *(reinterpret_cast<T *>(out_ptr)) = out_temp;
1224 return _border_size;
1232 _data_layout =
src->data_layout();
1237 const unsigned int conv_pad_right =
conv_info.pad_right();
1238 const unsigned int conv_pad_bottom =
conv_info.pad_bottom();
1260 auto win_config = validate_and_configure_window(
src, weights,
dst,
conv_info, _num_weight_elems_read_per_row,
1261 _num_elems_read_per_iteration, _num_elems_written_per_iteration, _border_size);
1263 ICpuKernel::configure(win_config.second);
1268 unsigned int num_weight_elems_read_per_row = 0;
1269 unsigned int num_elems_read_per_iteration = 0;
1270 unsigned int num_elems_written_per_iteration = 0;
1274 weights->
clone().get(),
1277 num_weight_elems_read_per_row,
1278 num_elems_read_per_iteration,
1279 num_elems_written_per_iteration,
1303 switch(
src->info()->data_type())
1306 convolve_1x1<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration,
src, weights,
dst, _conv_info);
1308 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 1310 convolve_1x1<float16_t, float16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration,
src, weights,
dst, _conv_info);
1321 switch(
src->info()->data_type())
1324 convolve_3x3<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration,
src, weights,
dst, _conv_info);
1326 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 1328 convolve_3x3<float16_t, float16_t>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration,
src, weights,
dst, _conv_info);
1339 switch(
src->info()->data_type())
1342 convolve_5x5<float, float>(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration,
src, weights,
dst, _conv_info);
1359 switch(
src->info()->data_type())
1363 if(have_zero_x_internal_padding(
src->info(), weights->info()))
1365 convolve_nhwc_optimized<float>(window,
src, weights,
dst);
1369 convolve_nhwc<float>(window,
src, weights,
dst);
1381 return "CpuDirectConvolutionLayerKernel";
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
float16x8_t vmulq_f16(float16x8_t, float16x8_t)
#define ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(tensor)
virtual size_t dimension(size_t index) const =0
Return the size of the requested dimension.
const size_t conv_pad_left
Container for 2D border size.
TensorShape compute_deep_convolution_shape(const ITensorInfo &input, const ITensorInfo &weights, PadStrideInfo conv_info)
Calculate the deep convolution shape output shape of a tensor.
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
const size_t conv_stride_x
uint8x16_t vloadq(const uint8_t *ptr)
#define ARM_COMPUTE_RETURN_ON_ERROR(status)
Checks if a status contains an error and returns it.
uint8x8_t vadd(const uint8x8_t &a, const uint8x8_t &b)
1 channel, 1 F32 per channel
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
const DataLayout data_layout
Store the tensor's metadata.
float16x8_t vaddq_f16(float16x8_t, float16x8_t)
#define ARM_COMPUTE_ERROR_THROW_ON(status)
const size_t conv_stride_y
int get_input_num_elems_processed(unsigned int num_elems_written_per_iteration)
#define ARM_COMPUTE_RETURN_ERROR_ON(cond)
If the condition is true, an error is returned.
decltype(strategy::transforms) typedef type
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(...)
SimpleTensor< float > src
Copyright (c) 2017-2021 Arm Limited.
1 channel, 1 F16 per channel
float32x4x2_t convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)
#define ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(...)
const ITensor * get_const_tensor(int id) const
Get constant tensor of a given id.
uint8_t vgetlane(const uint8x8_t vector, const unsigned int lane)
const size_t input_stride_y
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
bool update_window_and_padding(Window &win, Ts &&... patterns)
Update window and padding size for each of the access patterns.
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
auto ceil_to_multiple(S value, T divisor) -> decltype(((value+divisor - 1)/divisor) *divisor)
Computes the smallest number larger or equal to value that is a multiple of divisor.
bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, QuantizationInfo quantization_info=QuantizationInfo())
Auto initialize the tensor info (shape, number of channels and data type) if the current assignment i...
virtual std::unique_ptr< T > clone() const =0
Provide a clone of the current object of class T.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
uint8x8_t vgetlow(const uint8x16_t val)
Padding and stride information class.
void set(size_t dimension, const Dimension &dim)
Set the values of a given dimension.
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
Num samples, channels, height, width.
uint8x8_t vgethigh(const uint8x16_t val)
static constexpr size_t DimY
Alias for dimension 1 also known as Y dimension.
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
ITensor * get_tensor(int id)
Get tensor of a given id from the pac.
Information about executing thread and CPU.
#define ARM_COMPUTE_CREATE_ERROR(error_code, msg)
Creates an error with a given message.
static constexpr size_t DimZ
Alias for dimension 2 also known as Z dimension.
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(...)
Num samples, height, width, channels.
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage)
const size_t input_stride_z
#define ARM_COMPUTE_RETURN_ERROR_ON_MSG(cond, msg)
If the condition is true, an error is returned.
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
uint8x8_t vdup_n(uint8_t value, traits::vector_64_tag)
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_funct...
float32x4x3_t load_matrix_row(const float *ptr)
const size_t conv_pad_top
Includes all wrapper headers at once.
uint8x8_t vmla(const uint8x8_t &a, const uint8x8_t &b, const uint8x8_t &c)
size_t get_data_layout_dimension_index(const DataLayout data_layout, const DataLayoutDimension data_layout_dimension)
Get the index of the given dimension.
DataType
Available data types.
DataLayout
[DataLayout enum definition]
Describe a multidimensional execution window.
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)
Status validate(const ITensorInfo *scores_in, const ITensorInfo *boxes_in, const ITensorInfo *batch_splits_in, const ITensorInfo *scores_out, const ITensorInfo *boxes_out, const ITensorInfo *classes, const ITensorInfo *batch_splits_out, const ITensorInfo *keeps, const ITensorInfo *keeps_size, const BoxNMSLimitInfo info)