Compute Library 21.02
arm_compute::detail Namespace Reference

Data Structures

struct  brelu
 Bounded RELU activation object. More...
 
class  compare_dimension
 Function to compare two Dimensions objects and throw an error on mismatch. More...
 
struct  dummy
 Dummy activation object. More...
 
struct  get_tensor_info_t
 Get the info for a tensor; dummy struct for the generic case. More...
 
struct  get_tensor_info_t< ITensorInfo * >
 Get the info for a tensor. More...
 
struct  linear
 Linear activation object. More...
 
struct  logistic
 Logistic activation object. More...
 
struct  lubrelu
 Lower-Upper Bounded RELU activation object. More...
 
struct  relu
 RELU activation object. More...
 
struct  square
 Square activation object. More...
 

Functions

template<typename T >
bool have_different_dimensions (const Dimensions< T > &dim1, const Dimensions< T > &dim2, unsigned int upper_dim)
 
template<typename F >
arm_compute::Status for_each_error (F &&)
 
template<typename F , typename T , typename... Ts>
arm_compute::Status for_each_error (F &&func, T &&arg, Ts &&... args)
 
float32x4x3_t load_matrix_row (const float *ptr)
 
template<unsigned int stridex>
float32x4x2_t convolve_3x3 (const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)
 
template<>
float32x4x2_t convolve_3x3< 1 > (const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)
 
template<>
float32x4x2_t convolve_3x3< 2 > (const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)
 
template<>
float32x4x2_t convolve_3x3< 3 > (const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)
 
template<unsigned int stridex>
void store_results (float *buffer, const float32x4x2_t &values)
 Stores a float32x4x2_t array into a memory location. More...
 
template<>
void store_results< 1 > (float *buffer, const float32x4x2_t &values)
 
template<>
void store_results< 2 > (float *buffer, const float32x4x2_t &values)
 
template<>
void store_results< 3 > (float *buffer, const float32x4x2_t &values)
 
template<unsigned int stridex>
int get_input_num_elems_processed (unsigned int num_elems_written_per_iteration)
 
template<>
int get_input_num_elems_processed< 1 > (unsigned int num_elems_written_per_iteration)
 
template<>
int get_input_num_elems_processed< 2 > (unsigned int num_elems_written_per_iteration)
 
template<>
int get_input_num_elems_processed< 3 > (unsigned int num_elems_written_per_iteration)
 
float32x4x3_t load_matrix_row (const float *ptr, int weights_offset=0)
 Loads a 3x3 matrix as a row (float). More...
 
template<typename T , ARM_COMPUTE_REQUIRES_TA(std::is_same< T, uint8_t >::value||std::is_same< T, int8_t >::value) >
int32x4x3_t load_matrix_row (const T *ptr, int weights_offset=0)
 Loads a 3x3 matrix as a row (uint8_t/int8_t). More...
 
template<unsigned int stridex>
void store_results (int32_t *buffer, const int32x4x2_t &values)
 Stores an int32x4x2_t array into a memory location. More...
 
template<>
void store_results< 1 > (int32_t *buffer, const int32x4x2_t &values)
 
template<>
void store_results< 2 > (int32_t *buffer, const int32x4x2_t &values)
 
template<>
void store_results< 3 > (int32_t *buffer, const int32x4x2_t &values)
 
template<unsigned int stridex>
void accumulate_results (float *buffer, const float32x4x2_t &values)
 
template<>
void accumulate_results< 1 > (float *buffer, const float32x4x2_t &values)
 
template<>
void accumulate_results< 2 > (float *buffer, const float32x4x2_t &values)
 
template<>
void accumulate_results< 3 > (float *buffer, const float32x4x2_t &values)
 
template<unsigned int stridex>
void accumulate_results (int32_t *buffer, const int32x4x2_t &values)
 
template<>
void accumulate_results< 1 > (int32_t *buffer, const int32x4x2_t &values)
 
template<>
void accumulate_results< 2 > (int32_t *buffer, const int32x4x2_t &values)
 
template<>
void accumulate_results< 3 > (int32_t *buffer, const int32x4x2_t &values)
 
float32x4_t single_convolve_3x3_dilation (const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, const size_t dilation_x, int input_offset)
 Perform a 3x3 convolution for 4 consecutive elements on float32 when dilation.x() or dilation.y() is not 1. More...
 
float32x4x2_t convolve_3x3_dilation (const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, const size_t dilation_x, unsigned int stridex, int input_offset=0)
 Perform a 3x3 convolution for 8 consecutive elements on float32 when dilation.x() or dilation.y() is not 1. More...
 
template<bool accumulate>
void convolve_3x3 (const float *in_top, const float *in_mid, const float *in_low, float *out_ptr, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, unsigned int stridex, int input_offset=0)
 Perform a convolve3x3 on float32. More...
 
template<typename T , ARM_COMPUTE_REQUIRES_TA(std::is_same< T, uint8_t >::value||std::is_same< T, int8_t >::value) >
int32x4_t single_convolve_3x3_dilation (const T *in_top, const T *in_mid, const T *in_low, const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, size_t dilation_x, int32_t input_offset)
 Perform a 3x3 convolution for 4 consecutive 8-bit elements when dilation.x() or dilation.y() is not 1. More...
 
template<typename T , ARM_COMPUTE_REQUIRES_TA(std::is_same< T, uint8_t >::value||std::is_same< T, int8_t >::value) >
int32x4x2_t convolve_3x3_dilation (const T *in_top, const T *in_mid, const T *in_low, const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, const size_t dilation_x, unsigned int stridex, int input_offset)
 Perform a 3x3 convolution for 4 consecutive 8-bit elements when dilation.x() or dilation.y() is not 1. More...
 
template<bool accumulate, typename T1 , typename T2 , ARM_COMPUTE_REQUIRES_TA(std::is_same< T1, uint8_t >::value||std::is_same< T1, int8_t >::value) >
void convolve_3x3 (const T1 *in_top, const T1 *in_mid, const T1 *in_low, T2 *out_ptr, const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, unsigned int stridex, int32_t input_offset)
 Perform a convolve3x3 on 8-bit elements. More...
 
int get_input_num_elems_processed (unsigned int num_elems_written_per_iteration, unsigned int stridex)
 Get the number of elements processed on 3x3 convolution. More...
 

Function Documentation

◆ accumulate_results() [1/2]

void arm_compute::detail::accumulate_results ( float *  buffer,
const float32x4x2_t &  values 
)
inline

◆ accumulate_results() [2/2]

void arm_compute::detail::accumulate_results ( int32_t *  buffer,
const int32x4x2_t &  values 
)

◆ accumulate_results< 1 >() [1/2]

void arm_compute::detail::accumulate_results< 1 > ( float *  buffer,
const float32x4x2_t &  values 
)
inline

Definition at line 145 of file NEDirectConvolutionDetail.h.

Referenced by accumulate_results< 3 >(), and convolve_3x3().

146 {
147  vst1q_f32(buffer, vaddq_f32(vld1q_f32(buffer), values.val[0]));
148  vst1q_f32(buffer + 4, vaddq_f32(vld1q_f32(buffer + 4), values.val[1]));
149 }

◆ accumulate_results< 1 >() [2/2]

void arm_compute::detail::accumulate_results< 1 > ( int32_t *  buffer,
const int32x4x2_t &  values 
)
inline

Definition at line 167 of file NEDirectConvolutionDetail.h.

168 {
169  vst1q_s32(buffer, vaddq_s32(vld1q_s32(buffer), values.val[0]));
170  vst1q_s32(buffer + 4, vaddq_s32(vld1q_s32(buffer + 4), values.val[1]));
171 }

◆ accumulate_results< 2 >() [1/2]

void arm_compute::detail::accumulate_results< 2 > ( float *  buffer,
const float32x4x2_t &  values 
)
inline

Definition at line 152 of file NEDirectConvolutionDetail.h.

Referenced by accumulate_results< 3 >(), and convolve_3x3().

153 {
154  vst1q_f32(buffer, vaddq_f32(vld1q_f32(buffer), values.val[0]));
155 }

◆ accumulate_results< 2 >() [2/2]

void arm_compute::detail::accumulate_results< 2 > ( int32_t *  buffer,
const int32x4x2_t &  values 
)
inline

Definition at line 174 of file NEDirectConvolutionDetail.h.

175 {
176  vst1q_s32(buffer, vaddq_s32(vld1q_s32(buffer), values.val[0]));
177 }

◆ accumulate_results< 3 >() [1/2]

void arm_compute::detail::accumulate_results< 3 > ( float *  buffer,
const float32x4x2_t &  values 
)
inline

Definition at line 158 of file NEDirectConvolutionDetail.h.

References accumulate_results().

Referenced by accumulate_results< 3 >(), and convolve_3x3().

159 {
160  vst1_f32(buffer, vadd_f32(vld1_f32(buffer), vget_low_f32(values.val[0])));
161 }

◆ accumulate_results< 3 >() [2/2]

void arm_compute::detail::accumulate_results< 3 > ( int32_t *  buffer,
const int32x4x2_t &  values 
)
inline

Definition at line 180 of file NEDirectConvolutionDetail.h.

References accumulate_results(), accumulate_results< 1 >(), accumulate_results< 2 >(), accumulate_results< 3 >(), store_results(), store_results< 1 >(), store_results< 2 >(), store_results< 3 >(), vadd_f16(), and vaddq_f16().

181 {
182  vst1_s32(buffer, vadd_s32(vld1_s32(buffer), vget_low_s32(values.val[0])));
183 }

◆ convolve_3x3() [1/3]

float32x4x2_t arm_compute::detail::convolve_3x3 ( const float *  in_top,
const float *  in_mid,
const float *  in_low,
const float32x4x3_t &  m0,
const float32x4x3_t &  m1,
const float32x4x3_t &  m2 
)

◆ convolve_3x3() [2/3]

void convolve_3x3 ( const float *  in_top,
const float *  in_mid,
const float *  in_low,
float *  out_ptr,
const float32x4x3_t &  m0,
const float32x4x3_t &  m1,
const float32x4x3_t &  m2,
unsigned int  stridex,
int  input_offset = 0 
)
inline

Perform a convolve3x3 on float32.

Parameters
[in]in_topPointer to the first row of the input.
[in]in_midPointer to the second row of the input.
[in]in_lowPointer to the third row of the input.
[out]out_ptrPointer to the output.
[in]m0First row of the filter.
[in]m1Second row of the filter.
[in]m2Third row of the filter.
[in]stridexStride value in elements across x.
[in]input_offset(Optional) Input quantization offset.

Definition at line 353 of file NEDirectConvolutionDetail.h.

References accumulate(), accumulate_results< 1 >(), accumulate_results< 2 >(), accumulate_results< 3 >(), ARM_COMPUTE_ERROR_ON, ARM_COMPUTE_UNUSED, store_results< 1 >(), store_results< 2 >(), and store_results< 3 >().

356 {
357  ARM_COMPUTE_UNUSED(input_offset);
358  ARM_COMPUTE_ERROR_ON(stridex > 3);
359 
360  float32x4x2_t out =
361  {
362  {
363  vdupq_n_f32(0.f),
364  vdupq_n_f32(0.f)
365  }
366  };
367  if(stridex == 2)
368  {
369  const float32x4x2_t vtop = vld2q_f32(in_top);
370  const float32x4x2_t vmid = vld2q_f32(in_mid);
371  const float32x4x2_t vlow = vld2q_f32(in_low);
372  const float32x4_t vtop_end = vld1q_f32(in_top + 8);
373  const float32x4_t vmid_end = vld1q_f32(in_mid + 8);
374  const float32x4_t vlow_end = vld1q_f32(in_low + 8);
375 
376  out.val[0] = vmulq_f32(vtop.val[0], m0.val[0]);
377 
378  out.val[0] = vmlaq_f32(out.val[0], vtop.val[1], m0.val[1]);
379  out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop_end, 1), m0.val[2]);
380 
381  out.val[0] = vmlaq_f32(out.val[0], vmid.val[0], m1.val[0]);
382  out.val[0] = vmlaq_f32(out.val[0], vmid.val[1], m1.val[1]);
383  out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid_end, 1), m1.val[2]);
384 
385  out.val[0] = vmlaq_f32(out.val[0], vlow.val[0], m2.val[0]);
386  out.val[0] = vmlaq_f32(out.val[0], vlow.val[1], m2.val[1]);
387  out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow_end, 1), m2.val[2]);
388 
389  accumulate ? accumulate_results<2>(out_ptr, out) : store_results<2>(out_ptr, out);
390  }
391  else
392  {
393  const float32x4x3_t vtop =
394  {
395  {
396  vld1q_f32(in_top),
397  vld1q_f32(in_top + 4),
398  vld1q_f32(in_top + 8)
399  }
400  };
401  const float32x4x3_t vmid =
402  {
403  {
404  vld1q_f32(in_mid),
405  vld1q_f32(in_mid + 4),
406  vld1q_f32(in_mid + 8)
407  }
408  };
409  const float32x4x3_t vlow =
410  {
411  {
412  vld1q_f32(in_low),
413  vld1q_f32(in_low + 4),
414  vld1q_f32(in_low + 8)
415  }
416  };
417  out.val[0] = vmulq_f32(vtop.val[0], m0.val[0]);
418  out.val[1] = vmulq_f32(vtop.val[1], m0.val[0]);
419 
420  out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]);
421  out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]);
422 
423  out.val[0] = vmlaq_f32(out.val[0], vmid.val[0], m1.val[0]);
424  out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 1), m1.val[1]);
425  out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 2), m1.val[2]);
426 
427  out.val[0] = vmlaq_f32(out.val[0], vlow.val[0], m2.val[0]);
428  out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 1), m2.val[1]);
429  out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 2), m2.val[2]);
430 
431  out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 1), m0.val[1]);
432  out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 2), m0.val[2]);
433 
434  out.val[1] = vmlaq_f32(out.val[1], vmid.val[1], m1.val[0]);
435  out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 1), m1.val[1]);
436  out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 2), m1.val[2]);
437 
438  out.val[1] = vmlaq_f32(out.val[1], vlow.val[1], m2.val[0]);
439  out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 1), m2.val[1]);
440  out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 2), m2.val[2]);
441 
442  if(stridex == 3)
443  {
444  out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
445  accumulate ? accumulate_results<3>(out_ptr, out) : store_results<3>(out_ptr, out);
446  }
447  else
448  {
449  accumulate ? accumulate_results<1>(out_ptr, out) : store_results<1>(out_ptr, out);
450  }
451  }
452 }
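A minimal usage sketch (not library code): load the three filter rows with load_matrix_row() and let convolve_3x3<accumulate>() compute and write eight stride-1 outputs. The helper name conv_step and the include path are illustrative; adjust them to your tree.

#include <arm_neon.h>
#include "NEDirectConvolutionDetail.h" // detail header documented on this page

void conv_step(const float *in_top, const float *in_mid, const float *in_low,
               const float *weights, float *out_ptr)
{
    using namespace arm_compute::detail;
    const float32x4x3_t m0 = load_matrix_row(weights);     // first filter row, each weight broadcast to all lanes
    const float32x4x3_t m1 = load_matrix_row(weights + 3); // second filter row
    const float32x4x3_t m2 = load_matrix_row(weights + 6); // third filter row
    // accumulate == false: the eight results overwrite out_ptr via store_results<1>().
    convolve_3x3<false>(in_top, in_mid, in_low, out_ptr, m0, m1, m2, /* stridex */ 1);
}

With accumulate set to true the same call adds onto the existing contents of out_ptr instead, which is useful when summing partial results, for example over input channels.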

◆ convolve_3x3() [3/3]

void arm_compute::detail::convolve_3x3 ( const T1 *  in_top,
const T1 *  in_mid,
const T1 *  in_low,
T2 *  out_ptr,
const int32x4x3_t &  m0,
const int32x4x3_t &  m1,
const int32x4x3_t &  m2,
unsigned int  stridex,
int32_t  input_offset 
)

Perform a convolve3x3 on 8-bit elements.

Parameters
[in]in_topPointer to the first row of the input.
[in]in_midPointer to the second row of the input.
[in]in_lowPointer to the third row of the input.
[out]out_ptrPointer to the output.
[in]m0First row of the filter.
[in]m1Second row of the filter.
[in]m2Third row of the filter.
[in]stridexStride value in elements across x.
[in]input_offsetInput quantization offset.

Definition at line 594 of file NEDirectConvolutionDetail.h.

References ARM_COMPUTE_ERROR_ON.

597 {
598  ARM_COMPUTE_ERROR_ON(stridex > 3);
599  using VectorType = typename std::conditional<std::is_same<T1, uint8_t>::value, uint8x8x2_t, int8x8x2_t>::type;
600  using OutputTagType = typename wrapper::traits::neon_bitvector_tag_t<int32_t, wrapper::traits::BitWidth::W128>;
601 
602  const int32x4_t v_input_offset = wrapper::vdup_n(input_offset, OutputTagType{});
603 
604  const VectorType vtop =
605  {
606  {
607  wrapper::vload(in_top),
608  wrapper::vload(in_top + 8)
609  }
610  };
611  const VectorType vmid =
612  {
613  {
614  wrapper::vload(in_mid),
615  wrapper::vload(in_mid + 8)
616  }
617  };
618  const VectorType vlow =
619  {
620  {
621  wrapper::vload(in_low),
622  wrapper::vload(in_low + 8)
623  }
624  };
625 
626  const int32x4x3_t vtop_s32 =
627  {
628  {
632  }
633  };
634  const int32x4x3_t vmid_s32 =
635  {
636  {
640  }
641  };
642  const int32x4x3_t vlow_s32 =
643  {
644  {
648  }
649  };
650 
651  int32x4x2_t out
652  {
653  {
654  wrapper::vdup_n(static_cast<int32_t>(0), OutputTagType{}),
655  wrapper::vdup_n(static_cast<int32_t>(0), OutputTagType{}),
656  }
657  };
658 
659  // 0
660  out.val[0] = wrapper::vmla(out.val[0], vtop_s32.val[0], m0.val[0]);
661  out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_1(vtop_s32.val[0], vtop_s32.val[1]), m0.val[1]);
662  out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_2(vtop_s32.val[0], vtop_s32.val[1]), m0.val[2]);
663 
664  out.val[0] = wrapper::vmla(out.val[0], vmid_s32.val[0], m1.val[0]);
665  out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_1(vmid_s32.val[0], vmid_s32.val[1]), m1.val[1]);
666  out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_2(vmid_s32.val[0], vmid_s32.val[1]), m1.val[2]);
667 
668  out.val[0] = wrapper::vmla(out.val[0], vlow_s32.val[0], m2.val[0]);
669  out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_1(vlow_s32.val[0], vlow_s32.val[1]), m2.val[1]);
670  out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_2(vlow_s32.val[0], vlow_s32.val[1]), m2.val[2]);
671 
672  // 1
673  out.val[1] = wrapper::vmla(out.val[1], vtop_s32.val[1], m0.val[0]);
674  out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_1(vtop_s32.val[1], vtop_s32.val[2]), m0.val[1]);
675  out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_2(vtop_s32.val[1], vtop_s32.val[2]), m0.val[2]);
676 
677  out.val[1] = wrapper::vmla(out.val[1], vmid_s32.val[1], m1.val[0]);
678  out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_1(vmid_s32.val[1], vmid_s32.val[2]), m1.val[1]);
679  out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_2(vmid_s32.val[1], vmid_s32.val[2]), m1.val[2]);
680 
681  out.val[1] = wrapper::vmla(out.val[1], vlow_s32.val[1], m2.val[0]);
682  out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_1(vlow_s32.val[1], vlow_s32.val[2]), m2.val[1]);
683  out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_2(vlow_s32.val[1], vlow_s32.val[2]), m2.val[2]);
684 
685  if(stridex == 1)
686  {
687  accumulate ? accumulate_results<1>(out_ptr, out) : store_results<1>(out_ptr, out);
688  }
689  else if(stridex == 2)
690  {
691  out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 2), out.val[0], 1);
692  out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 0), out.val[0], 2);
693  out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 2), out.val[0], 3);
694 
695  accumulate ? accumulate_results<2>(out_ptr, out) : store_results<2>(out_ptr, out);
696  }
697  else if(stridex == 3)
698  {
699  out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 3), out.val[0], 1);
700  accumulate ? accumulate_results<3>(out_ptr, out) : store_results<3>(out_ptr, out);
701  }
702 }
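The quantized overload follows the same pattern; a hedged sketch (the helper name and offset values are placeholders, not library code):

#include <arm_neon.h>
#include "NEDirectConvolutionDetail.h" // adjust the include path to your tree

void conv_step_u8(const uint8_t *in_top, const uint8_t *in_mid, const uint8_t *in_low,
                  const uint8_t *weights, int32_t *out_ptr,
                  int32_t input_offset, int32_t weights_offset) // quantization offsets as described above
{
    using namespace arm_compute::detail;
    const int32x4x3_t m0 = load_matrix_row(weights, weights_offset);     // widened, offset-adjusted filter rows
    const int32x4x3_t m1 = load_matrix_row(weights + 3, weights_offset);
    const int32x4x3_t m2 = load_matrix_row(weights + 6, weights_offset);
    // accumulate == false: overwrite the int32 accumulators; stride 1 across x.
    convolve_3x3<false>(in_top, in_mid, in_low, out_ptr, m0, m1, m2, /* stridex */ 1, input_offset);
}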

◆ convolve_3x3< 1 >()

float32x4x2_t arm_compute::detail::convolve_3x3< 1 > ( const float *  in_top,
const float *  in_mid,
const float *  in_low,
const float32x4x3_t &  m0,
const float32x4x3_t &  m1,
const float32x4x3_t &  m2 
)
inline

Definition at line 51 of file NEDirectConvolution3x3.h.

Referenced by convolve_3x3< 2 >(), and convolve_3x3< 3 >().

52 {
53  const float32x4x3_t vtop =
54  {
55  {
56  vld1q_f32(in_top),
57  vld1q_f32(in_top + 4),
58  vld1q_f32(in_top + 8)
59  }
60  };
61  const float32x4x3_t vmid =
62  {
63  {
64  vld1q_f32(in_mid),
65  vld1q_f32(in_mid + 4),
66  vld1q_f32(in_mid + 8)
67  }
68  };
69  const float32x4x3_t vlow =
70  {
71  {
72  vld1q_f32(in_low),
73  vld1q_f32(in_low + 4),
74  vld1q_f32(in_low + 8)
75  }
76  };
77  float32x4x2_t out =
78  {
79  {
80  vmulq_f32(vtop.val[0], m0.val[0]),
81  vmulq_f32(vtop.val[1], m0.val[0])
82  }
83  };
84  out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]);
85  out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]);
86 
87  out.val[0] = vmlaq_f32(out.val[0], vmid.val[0], m1.val[0]);
88  out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 1), m1.val[1]);
89  out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 2), m1.val[2]);
90 
91  out.val[0] = vmlaq_f32(out.val[0], vlow.val[0], m2.val[0]);
92  out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 1), m2.val[1]);
93  out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 2), m2.val[2]);
94 
95  out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 1), m0.val[1]);
96  out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 2), m0.val[2]);
97 
98  out.val[1] = vmlaq_f32(out.val[1], vmid.val[1], m1.val[0]);
99  out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 1), m1.val[1]);
100  out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 2), m1.val[2]);
101 
102  out.val[1] = vmlaq_f32(out.val[1], vlow.val[1], m2.val[0]);
103  out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 1), m2.val[1]);
104  out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 2), m2.val[2]);
105  return out;
106 }

◆ convolve_3x3< 2 >()

float32x4x2_t arm_compute::detail::convolve_3x3< 2 > ( const float *  in_top,
const float *  in_mid,
const float *  in_low,
const float32x4x3_t &  m0,
const float32x4x3_t &  m1,
const float32x4x3_t &  m2 
)
inline

Definition at line 109 of file NEDirectConvolution3x3.h.

References convolve_3x3< 1 >().

110 {
111  float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2);
112  out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1);
113  out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2);
114  out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3);
115  return out;
116 }

◆ convolve_3x3< 3 >()

float32x4x2_t arm_compute::detail::convolve_3x3< 3 > ( const float *  in_top,
const float *  in_mid,
const float *  in_low,
const float32x4x3_t &  m0,
const float32x4x3_t &  m1,
const float32x4x3_t &  m2 
)
inline

Definition at line 119 of file NEDirectConvolution3x3.h.

References convolve_3x3< 1 >(), and store_results().

120 {
121  float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2);
122  out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
123  return out;
124 }
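The stride-templated overloads above only compute the packed results; a store_results call with the matching stride writes the lanes that are valid for that stride. A sketch (illustrative only; adjust the include path to your tree):

#include <arm_neon.h>
#include "NEDirectConvolution3x3.h" // header documented on this page

void conv3x3_stride2(const float *in_top, const float *in_mid, const float *in_low,
                     const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2,
                     float *out_ptr)
{
    using namespace arm_compute::detail;
    const float32x4x2_t res = convolve_3x3<2>(in_top, in_mid, in_low, m0, m1, m2); // stride 2 across x
    store_results<2>(out_ptr, res); // stride 2: only the first four lanes are written
}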

◆ convolve_3x3_dilation() [1/2]

float32x4x2_t arm_compute::detail::convolve_3x3_dilation ( const float *  in_top,
const float *  in_mid,
const float *  in_low,
const float32x4x3_t &  m0,
const float32x4x3_t &  m1,
const float32x4x3_t &  m2,
const size_t  dilation_x,
unsigned int  stridex,
int  input_offset = 0 
)
inline

Perform a 3x3 convolution for 8 consecutive elements on float32 when dilation.x() or dilation.y() is not 1.

Parameters
[in]in_topPointer to the first row of the input.
[in]in_midPointer to the second row of the input.
[in]in_lowPointer to the third row of the input.
[in]m0First row of the filter.
[in]m1Second row of the filter.
[in]m2Third row of the filter.
[in]dilation_xDilation, in elements across x.
[in]stridexStride value in elements across x.
[in]input_offset(Optional) Input quantization offset.

Definition at line 307 of file NEDirectConvolutionDetail.h.

References ARM_COMPUTE_ERROR_ON, convolve_3x3(), and single_convolve_3x3_dilation().

310 {
311  ARM_COMPUTE_ERROR_ON(stridex > 3);
312  float32x4x2_t out =
313  {
314  {
315  single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset),
316  single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset)
317  }
318  };
319 
320  if(stridex == 2)
321  {
322  out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1);
323  out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2);
324  out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3);
325  }
326  else if(stridex == 3)
327  {
328  out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
329  }
330 
331  return out;
332 }
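A hedged sketch of how the dilated helper pairs with the stride-specific store (the dilation factor and function name are illustrative):

#include <arm_neon.h>
#include "NEDirectConvolutionDetail.h" // adjust the include path to your tree

void dilated_step(const float *in_top, const float *in_mid, const float *in_low,
                  const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2,
                  float *out_ptr)
{
    using namespace arm_compute::detail;
    const float32x4x2_t res =
        convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, /* dilation_x */ 2, /* stridex */ 1);
    store_results<1>(out_ptr, res); // stride 1 keeps all eight results
}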

◆ convolve_3x3_dilation() [2/2]

int32x4x2_t arm_compute::detail::convolve_3x3_dilation ( const T *  in_top,
const T *  in_mid,
const T *  in_low,
const int32x4x3_t &  m0,
const int32x4x3_t &  m1,
const int32x4x3_t &  m2,
const size_t  dilation_x,
unsigned int  stridex,
int  input_offset 
)
inline

Perform a 3x3 convolution for 4 consecutive 8-bit elements when dilation.x() or dilation.y() is not 1.

Parameters
[in]in_topPointer to the first row of the input.
[in]in_midPointer to the second row of the input.
[in]in_lowPointer to the third row of the input.
[in]m0First row of the filter.
[in]m1Second row of the filter.
[in]m2Third row of the filter.
[in]dilation_xDilation, in elements across x.
[in]stridexStride value in elements across x.
[in]input_offsetInput quantization offset.

Definition at line 555 of file NEDirectConvolutionDetail.h.

References ARM_COMPUTE_ERROR_ON, single_convolve_3x3_dilation(), arm_compute::wrapper::vgetlane(), and arm_compute::wrapper::vsetlane().

557 {
558  ARM_COMPUTE_ERROR_ON(stridex > 3);
559  int32x4x2_t out =
560  {
561  {
562  single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset),
563  single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset)
564  }
565  };
566 
567  if(stridex == 2)
568  {
569  out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 2), out.val[0], 1);
570  out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 0), out.val[0], 2);
571  out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 2), out.val[0], 3);
572  }
573  else if(stridex == 3)
574  {
575  out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 3), out.val[0], 1);
576  }
577  return out;
578 }

◆ for_each_error() [1/2]

arm_compute::Status arm_compute::detail::for_each_error ( F &&  )
inline

Definition at line 108 of file Validate.h.

Referenced by arm_compute::error_on_mismatching_dimensions(), and for_each_error().

109 {
110  return arm_compute::Status{};
111 }

◆ for_each_error() [2/2]

arm_compute::Status arm_compute::detail::for_each_error ( F &&  func,
T &&  arg,
Ts &&...  args 
)
inline

Definition at line 114 of file Validate.h.

References GemmTuner::args, ARM_COMPUTE_RETURN_ON_ERROR, for_each_error(), and func.

115 {
116  ARM_COMPUTE_RETURN_ON_ERROR(func(arg));
117  ARM_COMPUTE_RETURN_ON_ERROR(for_each_error(func, args...));
118  return arm_compute::Status{};
119 }
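Read together, the two overloads fold a Status-returning callable over a parameter pack and propagate the first failing Status. A hedged sketch of a caller (all_non_null is hypothetical, not a library helper):

#include "arm_compute/core/Error.h"
#include "arm_compute/core/Validate.h"

// Assumes for_each_error() applies func to each argument in turn and stops at the first error.
template <typename... Ts>
arm_compute::Status all_non_null(const Ts *... ptrs)
{
    return arm_compute::detail::for_each_error(
        [](const void *p) -> arm_compute::Status
        {
            ARM_COMPUTE_RETURN_ERROR_ON_MSG(p == nullptr, "unexpected nullptr");
            return arm_compute::Status{};
        },
        ptrs...);
}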

◆ get_input_num_elems_processed() [1/2]

int arm_compute::detail::get_input_num_elems_processed ( unsigned int  num_elems_written_per_iteration)

Referenced by store_results< 3 >().

◆ get_input_num_elems_processed() [2/2]

int arm_compute::detail::get_input_num_elems_processed ( unsigned int  num_elems_written_per_iteration,
unsigned int  stridex 
)
inline

Get the number of elements processed on 3x3 convolution.

Parameters
[in]num_elems_written_per_iterationNumber of elements written per iteration on 3x3 convolution.
[in]stridexStride value in elements across x.
Returns
The number of elements processed.

Definition at line 948 of file NEDirectConvolutionDetail.h.

References ARM_COMPUTE_ERROR.

949 {
950  switch(stridex)
951  {
952  case 1:
953  return num_elems_written_per_iteration;
954  case 2:
955  return num_elems_written_per_iteration << 1;
956  case 3:
957  return num_elems_written_per_iteration * 3;
958  default:
959  ARM_COMPUTE_ERROR("stridex not supported");
960  return 0;
961  }
962 }
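In other words, the returned count is the output element count scaled by the stride. A quick sketch of the expected values (illustrative only; adjust the include path to your tree):

#include "NEDirectConvolutionDetail.h"

void example()
{
    using arm_compute::detail::get_input_num_elems_processed;
    const unsigned int written = 4;                            // outputs produced per iteration
    const int in1 = get_input_num_elems_processed(written, 1); // 4
    const int in2 = get_input_num_elems_processed(written, 2); // 8
    const int in3 = get_input_num_elems_processed(written, 3); // 12
    (void)in1; (void)in2; (void)in3;
}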

◆ get_input_num_elems_processed< 1 >()

int arm_compute::detail::get_input_num_elems_processed< 1 > ( unsigned int  num_elems_written_per_iteration)

Definition at line 152 of file NEDirectConvolution3x3.h.

153 {
154  return num_elems_written_per_iteration;
155 }

◆ get_input_num_elems_processed< 2 >()

int arm_compute::detail::get_input_num_elems_processed< 2 > ( unsigned int  num_elems_written_per_iteration)

Definition at line 158 of file NEDirectConvolution3x3.h.

159 {
160  return num_elems_written_per_iteration << 1;
161 }

◆ get_input_num_elems_processed< 3 >()

int arm_compute::detail::get_input_num_elems_processed< 3 > ( unsigned int  num_elems_written_per_iteration)

Definition at line 164 of file NEDirectConvolution3x3.h.

165 {
166  return num_elems_written_per_iteration * 3;
167 }

◆ have_different_dimensions()

bool arm_compute::detail::have_different_dimensions ( const Dimensions< T > &  dim1,
const Dimensions< T > &  dim2,
unsigned int  upper_dim 
)
inline

Definition at line 51 of file Validate.h.

Referenced by CLPixelWiseMultiplicationKernel::border_size(), arm_compute::error_on_mismatching_shapes(), arm_compute::error_on_tensors_not_even(), arm_compute::error_on_tensors_not_subsampled(), compare_dimension< T >::operator()(), NEPixelWiseMultiplicationKernel::run_op(), and NELogicalKernel::validate().

52 {
53  for(unsigned int i = upper_dim; i < arm_compute::Dimensions<T>::num_max_dimensions; ++i)
54  {
55  if(dim1[i] != dim2[i])
56  {
57  return true;
58  }
59  }
60 
61  return false;
62 }
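Since TensorShape derives from Dimensions, a typical use is comparing two shapes from a given dimension upwards while ignoring the lower ones. A hedged sketch (the helper name is hypothetical):

#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Validate.h"

// Returns true if the two shapes agree on every dimension from index 1 upwards,
// i.e. they may differ only in dimension 0.
bool same_above_dim0(const arm_compute::TensorShape &a, const arm_compute::TensorShape &b)
{
    return !arm_compute::detail::have_different_dimensions(a, b, /* upper_dim */ 1);
}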

◆ load_matrix_row() [1/3]

float32x4x3_t arm_compute::detail::load_matrix_row ( const float *  ptr)
inline

Definition at line 34 of file NEDirectConvolution3x3.h.

References convolve_3x3().

35 {
36  const float32x4x3_t r =
37  {
38  {
39  vld1q_dup_f32(ptr),
40  vld1q_dup_f32(1 + ptr),
41  vld1q_dup_f32(2 + ptr)
42  }
43  };
44  return r;
45 }

◆ load_matrix_row() [2/3]

float32x4x3_t arm_compute::detail::load_matrix_row ( const float *  ptr,
int  weights_offset = 0 
)
inline

Loads a 3x3 matrix as a row (float).

Parameters
[in]ptrPointer to a float 3x3 matrix.
[in]weights_offset(Optional) Weights quantization offset.
Returns
The loaded matrix.

Definition at line 46 of file NEDirectConvolutionDetail.h.

References ARM_COMPUTE_UNUSED.

47 {
48  ARM_COMPUTE_UNUSED(weights_offset);
49  const float32x4x3_t r =
50  {
51  {
52  vld1q_dup_f32(ptr),
53  vld1q_dup_f32(1 + ptr),
54  vld1q_dup_f32(2 + ptr)
55  }
56  };
57  return r;
58 }
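The returned row is laid out for vector multiply-accumulate: each filter weight is broadcast across a full vector. A small sketch of the resulting lanes (illustrative values; adjust the include path to your tree):

#include <arm_neon.h>
#include "NEDirectConvolutionDetail.h"

void example()
{
    const float row[3] = {0.25f, 0.5f, 0.25f}; // one row of a 3x3 filter
    const float32x4x3_t r = arm_compute::detail::load_matrix_row(row);
    // r.val[0] == {0.25, 0.25, 0.25, 0.25}
    // r.val[1] == {0.50, 0.50, 0.50, 0.50}
    // r.val[2] == {0.25, 0.25, 0.25, 0.25}
    (void)r;
}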

◆ load_matrix_row() [3/3]

int32x4x3_t arm_compute::detail::load_matrix_row ( const T *  ptr,
int  weights_offset = 0 
)
inline

Loads a 3x3 matrix as a row (uint8_t/int8_t).

Parameters
[in]ptrPointer to a uint8_t/int8_t 3x3 matrix.
[in]weights_offset(Optional) Weights quantization offset.
Returns
The loaded matrix.

Definition at line 68 of file NEDirectConvolutionDetail.h.

References store_results(), store_results< 1 >(), store_results< 2 >(), and store_results< 3 >().

69 {
70  const int32x4_t v_weights_offset = vdupq_n_s32(weights_offset);
71 
72  /* ptr is a pointer to a row in a 3x3 matrix, the function returns 3 vectors holding exactly the same value in all lanes:
73  r.val[0] contains the first element, r.val[1] the second element and r.val[2] the third element (in all lanes) */
74  int32x4x3_t r =
75  {
76  {
77  vaddq_s32(v_weights_offset, vdupq_n_s32(*ptr)),
78  vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 1))),
79  vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 2)))
80  }
81  };
82  return r;
83 }

◆ single_convolve_3x3_dilation() [1/2]

float32x4_t arm_compute::detail::single_convolve_3x3_dilation ( const float *  in_top,
const float *  in_mid,
const float *  in_low,
const float32x4x3_t &  m0,
const float32x4x3_t &  m1,
const float32x4x3_t &  m2,
const size_t  dilation_x,
int  input_offset 
)
inline

Perform a 3x3 convolution for 4 consecutive elements on float32 when dilation.x() or dilation.y() is not 1.

Parameters
[in]in_topPointer to the first row of the input.
[in]in_midPointer to the second row of the input.
[in]in_lowPointer to the third row of the input.
[in]m0First row of the filter.
[in]m1Second row of the filter.
[in]m2Third row of the filter.
[in]dilation_xDilation, in elements across x.
[in]input_offset(Optional) Input quantization offset.

Definition at line 249 of file NEDirectConvolutionDetail.h.

References ARM_COMPUTE_UNUSED.

Referenced by convolve_3x3_dilation().

252 {
253  ARM_COMPUTE_UNUSED(input_offset);
254 
255  const float32x4x3_t vtop =
256  {
257  {
258  vld1q_f32(in_top),
259  vld1q_f32(in_top + dilation_x),
260  vld1q_f32(in_top + 2 * dilation_x)
261  }
262  };
263  const float32x4x3_t vmid =
264  {
265  {
266  vld1q_f32(in_mid),
267  vld1q_f32(in_mid + dilation_x),
268  vld1q_f32(in_mid + 2 * dilation_x)
269  }
270  };
271  const float32x4x3_t vlow =
272  {
273  {
274  vld1q_f32(in_low),
275  vld1q_f32(in_low + dilation_x),
276  vld1q_f32(in_low + 2 * dilation_x)
277  }
278  };
279  float32x4_t out = vmulq_f32(vtop.val[0], m0.val[0]);
280  out = vmlaq_f32(out, vtop.val[1], m0.val[1]);
281  out = vmlaq_f32(out, vtop.val[2], m0.val[2]);
282 
283  out = vmlaq_f32(out, vmid.val[0], m1.val[0]);
284  out = vmlaq_f32(out, vmid.val[1], m1.val[1]);
285  out = vmlaq_f32(out, vmid.val[2], m1.val[2]);
286 
287  out = vmlaq_f32(out, vlow.val[0], m2.val[0]);
288  out = vmlaq_f32(out, vlow.val[1], m2.val[1]);
289  out = vmlaq_f32(out, vlow.val[2], m2.val[2]);
290 
291  return out;
292 }

◆ single_convolve_3x3_dilation() [2/2]

int32x4_t arm_compute::detail::single_convolve_3x3_dilation ( const T *  in_top,
const T *  in_mid,
const T *  in_low,
const int32x4x3_t &  m0,
const int32x4x3_t &  m1,
const int32x4x3_t &  m2,
size_t  dilation_x,
int32_t  input_offset 
)
inline

Perform a 3x3 convolution for 4 consecutive 8-bit elements when dilation.x() or dilation.y() is not 1.

Parameters
[in]in_topPointer to the first row of the input.
[in]in_midPointer to the second row of the input.
[in]in_lowPointer to the third row of the input.
[in]m0First row of the filter.
[in]m1Second row of the filter.
[in]m2Third row of the filter.
[in]dilation_xDilation, in elements across x.
[in]input_offsetInput quantization offset.

Definition at line 467 of file NEDirectConvolutionDetail.h.

470 {
471  using VectorType = typename std::conditional<std::is_same<T, uint8_t>::value, uint8x8x3_t, int8x8x3_t>::type;
472  using OutputTagType = typename wrapper::traits::neon_bitvector_tag_t<int32_t, wrapper::traits::BitWidth::W128>;
473 
474  const int32x4_t v_input_offset = wrapper::vdup_n(input_offset, OutputTagType{});
475 
476  const VectorType vtop =
477  {
478  {
479  wrapper::vload(in_top),
480  wrapper::vload(in_top + dilation_x),
481  wrapper::vload(in_top + 2 * dilation_x)
482  }
483  };
484  const VectorType vmid =
485  {
486  {
487  wrapper::vload(in_mid),
488  wrapper::vload(in_mid + dilation_x),
489  wrapper::vload(in_mid + 2 * dilation_x)
490  }
491  };
492  const VectorType vlow =
493  {
494  {
495  wrapper::vload(in_low),
496  wrapper::vload(in_low + dilation_x),
497  wrapper::vload(in_low + 2 * dilation_x)
498  }
499  };
500 
501  const int32x4x3_t vtop_s32 =
502  {
503  {
507  }
508  };
509  const int32x4x3_t vmid_s32 =
510  {
511  {
515  }
516  };
517  const int32x4x3_t vlow_s32 =
518  {
519  {
523  }
524  };
525 
526  int32x4_t out = wrapper::vmul(vtop_s32.val[0], m0.val[0]);
527  out = wrapper::vmla(out, vtop_s32.val[1], m0.val[1]);
528  out = wrapper::vmla(out, vtop_s32.val[2], m0.val[2]);
529 
530  out = wrapper::vmla(out, vmid_s32.val[0], m1.val[0]);
531  out = wrapper::vmla(out, vmid_s32.val[1], m1.val[1]);
532  out = wrapper::vmla(out, vmid_s32.val[2], m1.val[2]);
533 
534  out = wrapper::vmla(out, vlow_s32.val[0], m2.val[0]);
535  out = wrapper::vmla(out, vlow_s32.val[1], m2.val[1]);
536  out = wrapper::vmla(out, vlow_s32.val[2], m2.val[2]);
537 
538  return out;
539 }

◆ store_results() [1/2]

void arm_compute::detail::store_results ( int32_t *  buffer,
const int32x4x2_t &  values 
)

Stores an int32x4x2_t array into a memory location.

Parameters
[in]bufferPointer to the memory location where the values will be stored.
[in]valuesValues that will be stored.

◆ store_results() [2/2]

void store_results ( float *  buffer,
const float32x4x2_t &  values 
)

Stores a float32x4x2_t array into a memory location.

Parameters
[in]bufferPointer to the memory location where the values will be stored.
[in]valuesValues that will be stored.

Referenced by accumulate_results< 3 >(), NEConvolutionKernel< matrix_size >::configure(), convolve_3x3< 3 >(), load_matrix_row(), NESeparableConvolutionVertKernel< matrix_size >::run(), and NEConvolutionRectangleKernel::run().

◆ store_results< 1 >() [1/2]

void arm_compute::detail::store_results< 1 > ( int32_t *  buffer,
const int32x4x2_t &  values 
)
inline

Definition at line 123 of file NEDirectConvolutionDetail.h.

124 {
125  vst1q_s32(buffer, values.val[0]);
126  vst1q_s32(buffer + 4, values.val[1]);
127 }

◆ store_results< 1 >() [2/2]

void store_results< 1 > ( float *  buffer,
const float32x4x2_t &  values 
)
inline

Definition at line 130 of file NEDirectConvolution3x3.h.

Referenced by accumulate_results< 3 >(), convolve_3x3(), and load_matrix_row().

131 {
132  vst1q_f32(buffer, values.val[0]);
133  vst1q_f32(buffer + 4, values.val[1]);
134 }

◆ store_results< 2 >() [1/2]

void arm_compute::detail::store_results< 2 > ( int32_t *  buffer,
const int32x4x2_t &  values 
)
inline

Definition at line 130 of file NEDirectConvolutionDetail.h.

131 {
132  vst1q_s32(buffer, values.val[0]);
133 }

◆ store_results< 2 >() [2/2]

void store_results< 2 > ( float *  buffer,
const float32x4x2_t &  values 
)
inline

Definition at line 137 of file NEDirectConvolution3x3.h.

Referenced by accumulate_results< 3 >(), convolve_3x3(), and load_matrix_row().

138 {
139  vst1q_f32(buffer, values.val[0]);
140 }

◆ store_results< 3 >() [1/2]

void arm_compute::detail::store_results< 3 > ( int32_t *  buffer,
const int32x4x2_t &  values 
)
inline

Definition at line 136 of file NEDirectConvolutionDetail.h.

References accumulate_results().

137 {
138  vst1_s32(buffer, vget_low_s32(values.val[0]));
139 }

◆ store_results< 3 >() [2/2]

void store_results< 3 > ( float *  buffer,
const float32x4x2_t &  values 
)
inline

Definition at line 143 of file NEDirectConvolution3x3.h.

References get_input_num_elems_processed().

Referenced by accumulate_results< 3 >(), convolve_3x3(), and load_matrix_row().

144 {
145  vst1_f32(buffer, vget_low_f32(values.val[0]));
146 }