Compute Library 21.05
arm_compute::detail Namespace Reference

Data Structures

struct  brelu
 Bounded RELU activation object.
 
class  compare_dimension
 Function to compare two Dimensions objects and throw an error on mismatch.
 
struct  dummy
 Dummy activation object.
 
struct  get_tensor_info_t
 Get the info for a tensor, dummy struct.
 
struct  get_tensor_info_t< ITensorInfo * >
 Get the info for a tensor.
 
struct  Header
 
struct  linear
 Linear activation object.
 
struct  logistic
 Logistic activation object.
 
struct  lubrelu
 Lower-Upper Bounded RELU activation object.
 
struct  relu
 RELU activation object.
 
struct  square
 Square activation object.
 

Enumerations

enum  ObjectType : uint32_t {
  Context = 1, Queue = 2, Tensor = 3, TensorPack = 4,
  Operator = 5, Invalid = 0x56DEAD78
}
 Object type enumerations.
 

Functions

template<typename T >
bool have_different_dimensions (const Dimensions< T > &dim1, const Dimensions< T > &dim2, unsigned int upper_dim)
 
template<typename F >
arm_compute::Status for_each_error (F &&)
 
template<typename F , typename T , typename... Ts>
arm_compute::Status for_each_error (F &&func, T &&arg, Ts &&... args)
 
StatusCode validate_internal_context (const IContext *ctx)
 Check if an internal context is valid.
 
StatusCode validate_internal_queue (const IQueue *queue)
 Check if an internal queue is valid.
 
StatusCode validate_internal_tensor (const ITensorV2 *tensor)
 Check if an internal tensor is valid.
 
StatusCode validate_internal_pack (const TensorPack *pack)
 Check if an internal TensorPack is valid.
 
TensorInfo convert_to_legacy_tensor_info (const AclTensorDescriptor &desc)
 Convert a descriptor to a legacy format one.
 
AclTensorDescriptor convert_to_descriptor (const TensorInfo &info)
 Convert a legacy tensor meta-data to a descriptor.
 
float32x4x3_t load_matrix_row (const float *ptr)
 
template<unsigned int stridex>
float32x4x2_t convolve_3x3 (const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)
 
template<>
float32x4x2_t convolve_3x3< 1 > (const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)
 
template<>
float32x4x2_t convolve_3x3< 2 > (const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)
 
template<>
float32x4x2_t convolve_3x3< 3 > (const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2)
 
template<unsigned int stridex>
void store_results (float *buffer, const float32x4x2_t &values)
 Stores a float32x4x2_t array into a memory location.
 
template<>
void store_results< 1 > (float *buffer, const float32x4x2_t &values)
 
template<>
void store_results< 2 > (float *buffer, const float32x4x2_t &values)
 
template<>
void store_results< 3 > (float *buffer, const float32x4x2_t &values)
 
template<unsigned int stridex>
int get_input_num_elems_processed (unsigned int num_elems_written_per_iteration)
 
template<>
int get_input_num_elems_processed< 1 > (unsigned int num_elems_written_per_iteration)
 
template<>
int get_input_num_elems_processed< 2 > (unsigned int num_elems_written_per_iteration)
 
template<>
int get_input_num_elems_processed< 3 > (unsigned int num_elems_written_per_iteration)
 
float32x4x3_t load_matrix_row (const float *ptr, int weights_offset=0)
 Loads a 3x3 matrix as a row (float).
 
template<typename T , ARM_COMPUTE_REQUIRES_TA(std::is_same< T, uint8_t >::value||std::is_same< T, int8_t >::value) >
int32x4x3_t load_matrix_row (const T *ptr, int weights_offset=0)
 Loads a 3x3 matrix as a row (uint8_t/int8_t).
 
template<unsigned int stridex>
void store_results (int32_t *buffer, const int32x4x2_t &values)
 Stores an int32x4x2_t array into a memory location.
 
template<>
void store_results< 1 > (int32_t *buffer, const int32x4x2_t &values)
 
template<>
void store_results< 2 > (int32_t *buffer, const int32x4x2_t &values)
 
template<>
void store_results< 3 > (int32_t *buffer, const int32x4x2_t &values)
 
template<unsigned int stridex>
void accumulate_results (float *buffer, const float32x4x2_t &values)
 
template<>
void accumulate_results< 1 > (float *buffer, const float32x4x2_t &values)
 
template<>
void accumulate_results< 2 > (float *buffer, const float32x4x2_t &values)
 
template<>
void accumulate_results< 3 > (float *buffer, const float32x4x2_t &values)
 
template<unsigned int stridex>
void accumulate_results (int32_t *buffer, const int32x4x2_t &values)
 
template<>
void accumulate_results< 1 > (int32_t *buffer, const int32x4x2_t &values)
 
template<>
void accumulate_results< 2 > (int32_t *buffer, const int32x4x2_t &values)
 
template<>
void accumulate_results< 3 > (int32_t *buffer, const int32x4x2_t &values)
 
float32x4_t single_convolve_3x3_dilation (const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, const size_t dilation_x, int input_offset)
 Perform a 3x3 convolution for 4 consecutive elements on float32 when dilation.x() or dilation.y() is not 1.
 
float32x4x2_t convolve_3x3_dilation (const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, const size_t dilation_x, unsigned int stridex, int input_offset=0)
 Perform a 3x3 convolution for 8 consecutive elements on float32 when dilation.x() or dilation.y() is not 1.
 
template<bool accumulate>
void convolve_3x3 (const float *in_top, const float *in_mid, const float *in_low, float *out_ptr, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, unsigned int stridex, int input_offset=0)
 Perform a convolve3x3 on float32.
 
template<typename T , ARM_COMPUTE_REQUIRES_TA(std::is_same< T, uint8_t >::value||std::is_same< T, int8_t >::value) >
int32x4_t single_convolve_3x3_dilation (const T *in_top, const T *in_mid, const T *in_low, const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, size_t dilation_x, int32_t input_offset)
 Perform a 3x3 convolution for 4 consecutive 8-bit elements when dilation.x() or dilation.y() is not 1.
 
template<typename T , ARM_COMPUTE_REQUIRES_TA(std::is_same< T, uint8_t >::value||std::is_same< T, int8_t >::value) >
int32x4x2_t convolve_3x3_dilation (const T *in_top, const T *in_mid, const T *in_low, const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, const size_t dilation_x, unsigned int stridex, int input_offset)
 Perform a 3x3 convolution for 8 consecutive 8-bit elements when dilation.x() or dilation.y() is not 1.
 
template<bool accumulate, typename T1 , typename T2 , ARM_COMPUTE_REQUIRES_TA(std::is_same< T1, uint8_t >::value||std::is_same< T1, int8_t >::value) >
void convolve_3x3 (const T1 *in_top, const T1 *in_mid, const T1 *in_low, T2 *out_ptr, const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, unsigned int stridex, int32_t input_offset)
 Perform a convolve3x3 on 8-bit elements.
 
int get_input_num_elems_processed (unsigned int num_elems_written_per_iteration, unsigned int stridex)
 Get the number of elements processed on 3x3 convolution.
 

Enumeration Type Documentation

◆ ObjectType

enum ObjectType : uint32_t [strong]

Object type enumerations.

Enumerator
Context 
Queue 
Tensor 
TensorPack 
Operator 
Invalid 

Definition at line 37 of file Object.h.
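
The Invalid enumerator is a magic value (0x56DEAD78) rather than 0, which makes stale or corrupted handles cheap to reject. Below is a minimal sketch of how such a type tag can guard an opaque handle; the ObjectHeader struct and matches() helper are hypothetical, not the library's API:

#include <cstdint>

enum class ObjectType : uint32_t
{
    Context    = 1,
    Queue      = 2,
    Tensor     = 3,
    TensorPack = 4,
    Operator   = 5,
    Invalid    = 0x56DEAD78
};

// Hypothetical object header carrying the type tag, in the spirit of the
// Header struct documented above.
struct ObjectHeader
{
    ObjectType type{ ObjectType::Invalid };
};

// Reject a handle whose tag is wrong or already invalidated.
bool matches(const ObjectHeader &hdr, ObjectType expected)
{
    return hdr.type == expected && hdr.type != ObjectType::Invalid;
}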

Function Documentation

◆ accumulate_results() [1/2]

void arm_compute::detail::accumulate_results ( float *  buffer,
const float32x4x2_t &  values 
)
inline

◆ accumulate_results() [2/2]

void arm_compute::detail::accumulate_results ( int32_t *  buffer,
const int32x4x2_t &  values 
)

◆ accumulate_results< 1 >() [1/2]

void arm_compute::detail::accumulate_results< 1 > ( float *  buffer,
const float32x4x2_t &  values 
)
inline

Definition at line 144 of file NEDirectConvolutionDetail.h.

{
    vst1q_f32(buffer, vaddq_f32(vld1q_f32(buffer), values.val[0]));
    vst1q_f32(buffer + 4, vaddq_f32(vld1q_f32(buffer + 4), values.val[1]));
}

Referenced by convolve_3x3().

◆ accumulate_results< 1 >() [2/2]

void arm_compute::detail::accumulate_results< 1 > ( int32_t *  buffer,
const int32x4x2_t &  values 
)
inline

Definition at line 166 of file NEDirectConvolutionDetail.h.

{
    vst1q_s32(buffer, vaddq_s32(vld1q_s32(buffer), values.val[0]));
    vst1q_s32(buffer + 4, vaddq_s32(vld1q_s32(buffer + 4), values.val[1]));
}

◆ accumulate_results< 2 >() [1/2]

void arm_compute::detail::accumulate_results< 2 > ( float *  buffer,
const float32x4x2_t &  values 
)
inline

Definition at line 151 of file NEDirectConvolutionDetail.h.

{
    vst1q_f32(buffer, vaddq_f32(vld1q_f32(buffer), values.val[0]));
}

Referenced by convolve_3x3().

◆ accumulate_results< 2 >() [2/2]

void arm_compute::detail::accumulate_results< 2 > ( int32_t *  buffer,
const int32x4x2_t &  values 
)
inline

Definition at line 173 of file NEDirectConvolutionDetail.h.

{
    vst1q_s32(buffer, vaddq_s32(vld1q_s32(buffer), values.val[0]));
}

◆ accumulate_results< 3 >() [1/2]

void arm_compute::detail::accumulate_results< 3 > ( float *  buffer,
const float32x4x2_t &  values 
)
inline

Definition at line 157 of file NEDirectConvolutionDetail.h.

{
    vst1_f32(buffer, vadd_f32(vld1_f32(buffer), vget_low_f32(values.val[0])));
}

Referenced by convolve_3x3().

◆ accumulate_results< 3 >() [2/2]

void arm_compute::detail::accumulate_results< 3 > ( int32_t *  buffer,
const int32x4x2_t &  values 
)
inline

Definition at line 179 of file NEDirectConvolutionDetail.h.

{
    vst1_s32(buffer, vadd_s32(vld1_s32(buffer), vget_low_s32(values.val[0])));
}

◆ convert_to_descriptor()

AclTensorDescriptor convert_to_descriptor ( const TensorInfo &  info )

Convert a legacy tensor meta-data to a descriptor.

Parameters
[in]  info  Legacy tensor meta-data
Returns
A converted descriptor

Definition at line 97 of file LegacySupport.cpp.

{
    const auto          num_dims = info.num_dimensions();
    AclTensorDescriptor desc =
    {
        static_cast<int32_t>(num_dims),
        create_tensor_shape_array(info),
        convert_to_c_data_type(info.data_type()),
        nullptr,
        0
    };
    return desc;
}

References arm_compute::test::validation::info.

Referenced by ITensorV2::get_descriptor().

◆ convert_to_legacy_tensor_info()

TensorInfo convert_to_legacy_tensor_info ( const AclTensorDescriptor &  desc )

Convert a descriptor to a legacy format one.

Parameters
[in]  desc  Descriptor to convert
Returns
Legacy tensor meta-data

Definition at line 90 of file LegacySupport.cpp.

{
    TensorInfo legacy_desc;
    legacy_desc.init(create_legacy_tensor_shape(desc.ndims, desc.shape), 1, convert_to_legacy_data_type(desc.data_type));
    return legacy_desc;
}

References AclTensorDescriptor::data_type, TensorInfo::init(), AclTensorDescriptor::ndims, and AclTensorDescriptor::shape.

Referenced by ClTensor::ClTensor(), and CpuTensor::CpuTensor().
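
A hedged sketch of round-tripping metadata through the two converters above; it assumes only the signatures documented here and that the detail converters are visible from the including translation unit:

#include "arm_compute/core/TensorInfo.h"

// Round-trip sketch: legacy TensorInfo -> C-API descriptor -> TensorInfo.
arm_compute::TensorInfo round_trip(const arm_compute::TensorInfo &info)
{
    AclTensorDescriptor desc = arm_compute::detail::convert_to_descriptor(info);
    // ... the descriptor can now cross the C API boundary ...
    return arm_compute::detail::convert_to_legacy_tensor_info(desc);
}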

◆ convolve_3x3() [1/3]

float32x4x2_t arm_compute::detail::convolve_3x3 ( const float *  in_top,
const float *  in_mid,
const float *  in_low,
const float32x4x3_t &  m0,
const float32x4x3_t &  m1,
const float32x4x3_t &  m2 
)

◆ convolve_3x3() [2/3]

void convolve_3x3 ( const float *  in_top,
const float *  in_mid,
const float *  in_low,
float *  out_ptr,
const float32x4x3_t &  m0,
const float32x4x3_t &  m1,
const float32x4x3_t &  m2,
unsigned int  stridex,
int  input_offset = 0 
)
inline

Perform a convolve3x3 on float32.

Parameters
[in]  in_top  Pointer to the first row of the input.
[in]  in_mid  Pointer to the second row of the input.
[in]  in_low  Pointer to the third row of the input.
[out]  out_ptr  Pointer to the output.
[in]  m0  First row of the filter.
[in]  m1  Second row of the filter.
[in]  m2  Third row of the filter.
[in]  stridex  Stride value in elements across x.
[in]  input_offset  (Optional) Input quantization offset.

Definition at line 352 of file NEDirectConvolutionDetail.h.

{
    ARM_COMPUTE_UNUSED(input_offset);
    ARM_COMPUTE_ERROR_ON(stridex > 3);

    float32x4x2_t out =
    {
        {
            vdupq_n_f32(0.f),
            vdupq_n_f32(0.f)
        }
    };
    if(stridex == 2)
    {
        const float32x4x2_t vtop     = vld2q_f32(in_top);
        const float32x4x2_t vmid     = vld2q_f32(in_mid);
        const float32x4x2_t vlow     = vld2q_f32(in_low);
        const float32x4_t   vtop_end = vld1q_f32(in_top + 8);
        const float32x4_t   vmid_end = vld1q_f32(in_mid + 8);
        const float32x4_t   vlow_end = vld1q_f32(in_low + 8);

        out.val[0] = vmulq_f32(vtop.val[0], m0.val[0]);

        out.val[0] = vmlaq_f32(out.val[0], vtop.val[1], m0.val[1]);
        out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop_end, 1), m0.val[2]);

        out.val[0] = vmlaq_f32(out.val[0], vmid.val[0], m1.val[0]);
        out.val[0] = vmlaq_f32(out.val[0], vmid.val[1], m1.val[1]);
        out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid_end, 1), m1.val[2]);

        out.val[0] = vmlaq_f32(out.val[0], vlow.val[0], m2.val[0]);
        out.val[0] = vmlaq_f32(out.val[0], vlow.val[1], m2.val[1]);
        out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow_end, 1), m2.val[2]);

        accumulate ? accumulate_results<2>(out_ptr, out) : store_results<2>(out_ptr, out);
    }
    else
    {
        const float32x4x3_t vtop =
        {
            {
                vld1q_f32(in_top),
                vld1q_f32(in_top + 4),
                vld1q_f32(in_top + 8)
            }
        };
        const float32x4x3_t vmid =
        {
            {
                vld1q_f32(in_mid),
                vld1q_f32(in_mid + 4),
                vld1q_f32(in_mid + 8)
            }
        };
        const float32x4x3_t vlow =
        {
            {
                vld1q_f32(in_low),
                vld1q_f32(in_low + 4),
                vld1q_f32(in_low + 8)
            }
        };
        out.val[0] = vmulq_f32(vtop.val[0], m0.val[0]);
        out.val[1] = vmulq_f32(vtop.val[1], m0.val[0]);

        out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]);
        out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]);

        out.val[0] = vmlaq_f32(out.val[0], vmid.val[0], m1.val[0]);
        out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 1), m1.val[1]);
        out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 2), m1.val[2]);

        out.val[0] = vmlaq_f32(out.val[0], vlow.val[0], m2.val[0]);
        out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 1), m2.val[1]);
        out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 2), m2.val[2]);

        out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 1), m0.val[1]);
        out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 2), m0.val[2]);

        out.val[1] = vmlaq_f32(out.val[1], vmid.val[1], m1.val[0]);
        out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 1), m1.val[1]);
        out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 2), m1.val[2]);

        out.val[1] = vmlaq_f32(out.val[1], vlow.val[1], m2.val[0]);
        out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 1), m2.val[1]);
        out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 2), m2.val[2]);

        if(stridex == 3)
        {
            out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
            accumulate ? accumulate_results<3>(out_ptr, out) : store_results<3>(out_ptr, out);
        }
        else
        {
            accumulate ? accumulate_results<1>(out_ptr, out) : store_results<1>(out_ptr, out);
        }
    }
}

References arm_compute::test::validation::reference::accumulate(), accumulate_results< 1 >(), accumulate_results< 2 >(), accumulate_results< 3 >(), ARM_COMPUTE_ERROR_ON, ARM_COMPUTE_UNUSED, store_results< 1 >(), store_results< 2 >(), and store_results< 3 >().
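
A hedged usage sketch for the float path; the internal header path is an assumption based on the 21.05 source tree, and the buffer sizes follow the loads in the body above (each input row must provide at least 12 readable floats):

#include <arm_neon.h>
// Internal header; the path is an assumption for the 21.05 tree.
#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h"

using namespace arm_compute::detail;

void run_row()
{
    // Three input rows; stride 1 reads 12 floats per row (vld1q_f32 at +0, +4, +8).
    float top[12] = {}, mid[12] = {}, low[12] = {};
    float weights[9] = { 1.f, 0.f, -1.f, 2.f, 0.f, -2.f, 1.f, 0.f, -1.f }; // e.g. Sobel-x

    // Broadcast each filter row into three lane-replicated vectors.
    const float32x4x3_t m0 = load_matrix_row(weights + 0);
    const float32x4x3_t m1 = load_matrix_row(weights + 3);
    const float32x4x3_t m2 = load_matrix_row(weights + 6);

    float out[8] = {};
    // accumulate = false: store the results instead of adding to existing output.
    convolve_3x3<false>(top, mid, low, out, m0, m1, m2, /*stridex=*/1);
}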

◆ convolve_3x3() [3/3]

void arm_compute::detail::convolve_3x3 ( const T1 *  in_top,
const T1 *  in_mid,
const T1 *  in_low,
T2 *  out_ptr,
const int32x4x3_t &  m0,
const int32x4x3_t &  m1,
const int32x4x3_t &  m2,
unsigned int  stridex,
int32_t  input_offset 
)

Perform a convolve3x3 on 8-bit elements.

Parameters
[in]  in_top  Pointer to the first row of the input.
[in]  in_mid  Pointer to the second row of the input.
[in]  in_low  Pointer to the third row of the input.
[out]  out_ptr  Pointer to the output.
[in]  m0  First row of the filter.
[in]  m1  Second row of the filter.
[in]  m2  Third row of the filter.
[in]  stridex  Stride value in elements across x.
[in]  input_offset  Input quantization offset.

Definition at line 593 of file NEDirectConvolutionDetail.h.

{
    ARM_COMPUTE_ERROR_ON(stridex > 3);
    using VectorType    = typename std::conditional<std::is_same<T1, uint8_t>::value, uint8x8x2_t, int8x8x2_t>::type;
    using OutputTagType = typename wrapper::traits::neon_bitvector_tag_t<int32_t, wrapper::traits::BitWidth::W128>;

    const int32x4_t v_input_offset = wrapper::vdup_n(input_offset, OutputTagType{});

    const VectorType vtop =
    {
        {
            wrapper::vload(in_top),
            wrapper::vload(in_top + 8)
        }
    };
    const VectorType vmid =
    {
        {
            wrapper::vload(in_mid),
            wrapper::vload(in_mid + 8)
        }
    };
    const VectorType vlow =
    {
        {
            wrapper::vload(in_low),
            wrapper::vload(in_low + 8)
        }
    };

    const int32x4x3_t vtop_s32 =
    {
        {
            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[0])))),
            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vtop.val[0])))),
            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[1])))),
        }
    };
    const int32x4x3_t vmid_s32 =
    {
        {
            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[0])))),
            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vmid.val[0])))),
            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[1])))),
        }
    };
    const int32x4x3_t vlow_s32 =
    {
        {
            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[0])))),
            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vlow.val[0])))),
            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[1])))),
        }
    };

    int32x4x2_t out
    {
        {
            wrapper::vdup_n(static_cast<int32_t>(0), OutputTagType{}),
            wrapper::vdup_n(static_cast<int32_t>(0), OutputTagType{}),
        }
    };

    // 0
    out.val[0] = wrapper::vmla(out.val[0], vtop_s32.val[0], m0.val[0]);
    out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_1(vtop_s32.val[0], vtop_s32.val[1]), m0.val[1]);
    out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_2(vtop_s32.val[0], vtop_s32.val[1]), m0.val[2]);

    out.val[0] = wrapper::vmla(out.val[0], vmid_s32.val[0], m1.val[0]);
    out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_1(vmid_s32.val[0], vmid_s32.val[1]), m1.val[1]);
    out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_2(vmid_s32.val[0], vmid_s32.val[1]), m1.val[2]);

    out.val[0] = wrapper::vmla(out.val[0], vlow_s32.val[0], m2.val[0]);
    out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_1(vlow_s32.val[0], vlow_s32.val[1]), m2.val[1]);
    out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_2(vlow_s32.val[0], vlow_s32.val[1]), m2.val[2]);

    // 1
    out.val[1] = wrapper::vmla(out.val[1], vtop_s32.val[1], m0.val[0]);
    out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_1(vtop_s32.val[1], vtop_s32.val[2]), m0.val[1]);
    out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_2(vtop_s32.val[1], vtop_s32.val[2]), m0.val[2]);

    out.val[1] = wrapper::vmla(out.val[1], vmid_s32.val[1], m1.val[0]);
    out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_1(vmid_s32.val[1], vmid_s32.val[2]), m1.val[1]);
    out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_2(vmid_s32.val[1], vmid_s32.val[2]), m1.val[2]);

    out.val[1] = wrapper::vmla(out.val[1], vlow_s32.val[1], m2.val[0]);
    out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_1(vlow_s32.val[1], vlow_s32.val[2]), m2.val[1]);
    out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_2(vlow_s32.val[1], vlow_s32.val[2]), m2.val[2]);

    if(stridex == 1)
    {
        accumulate ? accumulate_results<1>(out_ptr, out) : store_results<1>(out_ptr, out);
    }
    else if(stridex == 2)
    {
        out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 2), out.val[0], 1);
        out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 0), out.val[0], 2);
        out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 2), out.val[0], 3);

        accumulate ? accumulate_results<2>(out_ptr, out) : store_results<2>(out_ptr, out);
    }
    else if(stridex == 3)
    {
        out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 3), out.val[0], 1);
        accumulate ? accumulate_results<3>(out_ptr, out) : store_results<3>(out_ptr, out);
    }
}

References arm_compute::test::validation::reference::accumulate(), accumulate_results< 1 >(), accumulate_results< 2 >(), accumulate_results< 3 >(), ARM_COMPUTE_ERROR_ON, store_results< 1 >(), store_results< 2 >(), store_results< 3 >(), type, arm_compute::wrapper::vaddw(), arm_compute::wrapper::vdup_n(), arm_compute::wrapper::vext_1(), arm_compute::wrapper::vext_2(), arm_compute::wrapper::vgethigh(), arm_compute::wrapper::vgetlane(), arm_compute::wrapper::vgetlow(), arm_compute::wrapper::vload(), arm_compute::wrapper::vmla(), arm_compute::wrapper::vmovl(), arm_compute::wrapper::vreinterpret(), and arm_compute::wrapper::vsetlane().

◆ convolve_3x3< 1 >()

float32x4x2_t arm_compute::detail::convolve_3x3< 1 > ( const float *  in_top,
const float *  in_mid,
const float *  in_low,
const float32x4x3_t &  m0,
const float32x4x3_t &  m1,
const float32x4x3_t &  m2 
)
inline

Definition at line 51 of file NEDirectConvolution3x3.h.

{
    const float32x4x3_t vtop =
    {
        {
            vld1q_f32(in_top),
            vld1q_f32(in_top + 4),
            vld1q_f32(in_top + 8)
        }
    };
    const float32x4x3_t vmid =
    {
        {
            vld1q_f32(in_mid),
            vld1q_f32(in_mid + 4),
            vld1q_f32(in_mid + 8)
        }
    };
    const float32x4x3_t vlow =
    {
        {
            vld1q_f32(in_low),
            vld1q_f32(in_low + 4),
            vld1q_f32(in_low + 8)
        }
    };
    float32x4x2_t out =
    {
        {
            vmulq_f32(vtop.val[0], m0.val[0]),
            vmulq_f32(vtop.val[1], m0.val[0])
        }
    };
    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]);
    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]);

    out.val[0] = vmlaq_f32(out.val[0], vmid.val[0], m1.val[0]);
    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 1), m1.val[1]);
    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 2), m1.val[2]);

    out.val[0] = vmlaq_f32(out.val[0], vlow.val[0], m2.val[0]);
    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 1), m2.val[1]);
    out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 2), m2.val[2]);

    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 1), m0.val[1]);
    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 2), m0.val[2]);

    out.val[1] = vmlaq_f32(out.val[1], vmid.val[1], m1.val[0]);
    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 1), m1.val[1]);
    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 2), m1.val[2]);

    out.val[1] = vmlaq_f32(out.val[1], vlow.val[1], m2.val[0]);
    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 1), m2.val[1]);
    out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 2), m2.val[2]);
    return out;
}

Referenced by convolve_3x3< 2 >(), and convolve_3x3< 3 >().

◆ convolve_3x3< 2 >()

float32x4x2_t arm_compute::detail::convolve_3x3< 2 > ( const float *  in_top,
const float *  in_mid,
const float *  in_low,
const float32x4x3_t &  m0,
const float32x4x3_t &  m1,
const float32x4x3_t &  m2 
)
inline

Definition at line 109 of file NEDirectConvolution3x3.h.

{
    float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2);
    out.val[0]        = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1);
    out.val[0]        = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2);
    out.val[0]        = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3);
    return out;
}

References convolve_3x3< 1 >().

◆ convolve_3x3< 3 >()

float32x4x2_t arm_compute::detail::convolve_3x3< 3 > ( const float *  in_top,
const float *  in_mid,
const float *  in_low,
const float32x4x3_t &  m0,
const float32x4x3_t &  m1,
const float32x4x3_t &  m2 
)
inline

Definition at line 119 of file NEDirectConvolution3x3.h.

{
    float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2);
    out.val[0]        = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
    return out;
}

References convolve_3x3< 1 >().

◆ convolve_3x3_dilation() [1/2]

float32x4x2_t arm_compute::detail::convolve_3x3_dilation ( const float *  in_top,
const float *  in_mid,
const float *  in_low,
const float32x4x3_t &  m0,
const float32x4x3_t &  m1,
const float32x4x3_t &  m2,
const size_t  dilation_x,
unsigned int  stridex,
int  input_offset = 0 
)
inline

Perform a 3x3 convolution for 8 consecutive elements on float32 when dilation.x() or dilation.y() is not 1.

Parameters
[in]  in_top  Pointer to the first row of the input.
[in]  in_mid  Pointer to the second row of the input.
[in]  in_low  Pointer to the third row of the input.
[in]  m0  First row of the filter.
[in]  m1  Second row of the filter.
[in]  m2  Third row of the filter.
[in]  dilation_x  Dilation, in elements across x.
[in]  stridex  Stride value in elements across x.
[in]  input_offset  (Optional) Input quantization offset.

Definition at line 306 of file NEDirectConvolutionDetail.h.

{
    ARM_COMPUTE_ERROR_ON(stridex > 3);
    float32x4x2_t out =
    {
        {
            single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset),
            single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset)
        }
    };

    if(stridex == 2)
    {
        out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1);
        out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2);
        out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3);
    }
    else if(stridex == 3)
    {
        out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1);
    }

    return out;
}

References ARM_COMPUTE_ERROR_ON, and single_convolve_3x3_dilation().

◆ convolve_3x3_dilation() [2/2]

int32x4x2_t arm_compute::detail::convolve_3x3_dilation ( const T *  in_top,
const T *  in_mid,
const T *  in_low,
const int32x4x3_t &  m0,
const int32x4x3_t &  m1,
const int32x4x3_t &  m2,
const size_t  dilation_x,
unsigned int  stridex,
int  input_offset 
)
inline

Perform a 3x3 convolution for 8 consecutive 8-bit elements when dilation.x() or dilation.y() is not 1.

Parameters
[in]  in_top  Pointer to the first row of the input.
[in]  in_mid  Pointer to the second row of the input.
[in]  in_low  Pointer to the third row of the input.
[in]  m0  First row of the filter.
[in]  m1  Second row of the filter.
[in]  m2  Third row of the filter.
[in]  dilation_x  Dilation, in elements across x.
[in]  stridex  Stride value in elements across x.
[in]  input_offset  Input quantization offset.

Definition at line 554 of file NEDirectConvolutionDetail.h.

{
    ARM_COMPUTE_ERROR_ON(stridex > 3);
    int32x4x2_t out =
    {
        {
            single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset),
            single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset)
        }
    };

    if(stridex == 2)
    {
        out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 2), out.val[0], 1);
        out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 0), out.val[0], 2);
        out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 2), out.val[0], 3);
    }
    else if(stridex == 3)
    {
        out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 3), out.val[0], 1);
    }
    return out;
}

References ARM_COMPUTE_ERROR_ON, single_convolve_3x3_dilation(), arm_compute::wrapper::vgetlane(), and arm_compute::wrapper::vsetlane().

◆ for_each_error() [1/2]

arm_compute::Status arm_compute::detail::for_each_error ( F &&  )
inline

Definition at line 104 of file Validate.h.

{
    return arm_compute::Status{};
}

Referenced by arm_compute::error_on_mismatching_dimensions(), and for_each_error().

◆ for_each_error() [2/2]

arm_compute::Status arm_compute::detail::for_each_error ( F &&  func,
T &&  arg,
Ts &&...  args 
)
inline

Definition at line 110 of file Validate.h.

{
    ARM_COMPUTE_RETURN_ON_ERROR(func(std::forward<T>(arg)));
    ARM_COMPUTE_RETURN_ON_ERROR(for_each_error(std::forward<F>(func), std::forward<Ts>(args)...));
    return arm_compute::Status{};
}

References GemmTuner::args, ARM_COMPUTE_RETURN_ON_ERROR, for_each_error(), and func.
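
A hedged sketch of folding a validator over several arguments with for_each_error, stopping at the first failure; the validate_all entry point and the check_not_null lambda are illustrative, not library code:

#include "arm_compute/core/Error.h"
#include "arm_compute/core/ITensorInfo.h"

arm_compute::Status validate_all(const arm_compute::ITensorInfo *a,
                                 const arm_compute::ITensorInfo *b,
                                 const arm_compute::ITensorInfo *c)
{
    // Hypothetical per-argument check; any failing Status short-circuits the fold.
    auto check_not_null = [](const arm_compute::ITensorInfo *info) -> arm_compute::Status
    {
        if(info == nullptr)
        {
            return arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR, "null tensor info");
        }
        return arm_compute::Status{};
    };
    return arm_compute::detail::for_each_error(check_not_null, a, b, c);
}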

◆ get_input_num_elems_processed() [1/2]

int arm_compute::detail::get_input_num_elems_processed ( unsigned int  num_elems_written_per_iteration)

◆ get_input_num_elems_processed() [2/2]

int arm_compute::detail::get_input_num_elems_processed ( unsigned int  num_elems_written_per_iteration,
unsigned int  stridex 
)
inline


Get the number of elements processed on 3x3 convolution.

Parameters
[in]  num_elems_written_per_iteration  Number of elements written per iteration on 3x3 convolution.
[in]  stridex  Stride value in elements across x.
Returns
The number of elements processed.

Definition at line 947 of file NEDirectConvolutionDetail.h.

{
    switch(stridex)
    {
        case 1:
            return num_elems_written_per_iteration;
        case 2:
            return num_elems_written_per_iteration << 1;
        case 3:
            return num_elems_written_per_iteration * 3;
        default:
            ARM_COMPUTE_ERROR("stridex not supported");
            return 0;
    }
}

References ARM_COMPUTE_ERROR.
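
In other words, the input advance equals num_elems_written_per_iteration * stridex; for example, writing 4 outputs per iteration at stridex = 2 consumes 8 input elements. A one-line sanity check under that reading (the internal header path is an assumption for the 21.05 tree):

#include <cassert>
// Internal header; the path is an assumption for the 21.05 tree.
#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h"

void check_input_advance()
{
    // Mirrors the switch above: input elements = outputs written * stride.
    assert(arm_compute::detail::get_input_num_elems_processed(4U, 2U) == 8);
}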

◆ get_input_num_elems_processed< 1 >()

int arm_compute::detail::get_input_num_elems_processed< 1 > ( unsigned int  num_elems_written_per_iteration)

Definition at line 152 of file NEDirectConvolution3x3.h.

{
    return num_elems_written_per_iteration;
}

◆ get_input_num_elems_processed< 2 >()

int arm_compute::detail::get_input_num_elems_processed< 2 > ( unsigned int  num_elems_written_per_iteration)

Definition at line 158 of file NEDirectConvolution3x3.h.

{
    return num_elems_written_per_iteration << 1;
}

◆ get_input_num_elems_processed< 3 >()

int arm_compute::detail::get_input_num_elems_processed< 3 > ( unsigned int  num_elems_written_per_iteration)

Definition at line 164 of file NEDirectConvolution3x3.h.

{
    return num_elems_written_per_iteration * 3;
}

◆ have_different_dimensions()

bool arm_compute::detail::have_different_dimensions ( const Dimensions< T > &  dim1,
const Dimensions< T > &  dim2,
unsigned int  upper_dim 
)
inline

Definition at line 47 of file Validate.h.

{
    for(unsigned int i = upper_dim; i < arm_compute::Dimensions<T>::num_max_dimensions; ++i)
    {
        if(dim1[i] != dim2[i])
        {
            return true;
        }
    }

    return false;
}

Referenced by arm_compute::error_on_mismatching_shapes(), arm_compute::error_on_tensors_not_even(), arm_compute::error_on_tensors_not_subsampled(), compare_dimension< T >::operator()(), and NELogicalKernel::validate().
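
A hedged usage sketch: comparing two shapes while ignoring dimensions below upper_dim (TensorShape derives from Dimensions, so it can be passed directly; the shapes here are illustrative):

#include "arm_compute/core/TensorShape.h"

void compare_shapes()
{
    const arm_compute::TensorShape a(8U, 4U, 2U);
    const arm_compute::TensorShape b(16U, 4U, 2U);

    // Dimensions 1 and upwards match, so starting at upper_dim = 1 reports no mismatch.
    const bool diff_from_1 = arm_compute::detail::have_different_dimensions(a, b, 1); // false
    // Starting at upper_dim = 0 also compares dimension 0 (8 vs 16).
    const bool diff_from_0 = arm_compute::detail::have_different_dimensions(a, b, 0); // true
    (void)diff_from_1;
    (void)diff_from_0;
}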

◆ load_matrix_row() [1/3]

float32x4x3_t arm_compute::detail::load_matrix_row ( const float *  ptr)
inline

Definition at line 34 of file NEDirectConvolution3x3.h.

{
    const float32x4x3_t r =
    {
        {
            vld1q_dup_f32(ptr),
            vld1q_dup_f32(1 + ptr),
            vld1q_dup_f32(2 + ptr)
        }
    };
    return r;
}

◆ load_matrix_row() [2/3]

float32x4x3_t arm_compute::detail::load_matrix_row ( const float *  ptr,
int  weights_offset = 0 
)
inline

Loads a 3x3 matrix as a row (float).

Parameters
[in]  ptr  Pointer to a float 3x3 matrix.
[in]  weights_offset  (Optional) Weights quantization offset.
Returns
The loaded matrix.

Definition at line 45 of file NEDirectConvolutionDetail.h.

{
    ARM_COMPUTE_UNUSED(weights_offset);
    const float32x4x3_t r =
    {
        {
            vld1q_dup_f32(ptr),
            vld1q_dup_f32(1 + ptr),
            vld1q_dup_f32(2 + ptr)
        }
    };
    return r;
}

References ARM_COMPUTE_UNUSED.

◆ load_matrix_row() [3/3]

int32x4x3_t arm_compute::detail::load_matrix_row ( const T *  ptr,
int  weights_offset = 0 
)
inline

Loads a 3x3 matrix as a row (uint8_t/int8_t).

Parameters
[in]  ptr  Pointer to a uint8_t/int8_t 3x3 matrix.
[in]  weights_offset  (Optional) Weights quantization offset.
Returns
The loaded matrix.

Definition at line 67 of file NEDirectConvolutionDetail.h.

{
    const int32x4_t v_weights_offset = vdupq_n_s32(weights_offset);

    /* ptr is a pointer to a row in a 3x3 matrix, the function returns 3 vectors holding exactly the same value in all lanes:
       r.val[0] contains the first element, r.val[1] the second element and r.val[2] the third element (in all lanes) */
    int32x4x3_t r =
    {
        {
            vaddq_s32(v_weights_offset, vdupq_n_s32(*ptr)),
            vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 1))),
            vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 2)))
        }
    };
    return r;
}

◆ single_convolve_3x3_dilation() [1/2]

float32x4_t arm_compute::detail::single_convolve_3x3_dilation ( const float *  in_top,
const float *  in_mid,
const float *  in_low,
const float32x4x3_t &  m0,
const float32x4x3_t &  m1,
const float32x4x3_t &  m2,
const size_t  dilation_x,
int  input_offset 
)
inline

Perform a 3x3 convolution for 4 consecutive elements on float32 when dilation.x() or dilation.y() is not 1.

Parameters
[in]  in_top  Pointer to the first row of the input.
[in]  in_mid  Pointer to the second row of the input.
[in]  in_low  Pointer to the third row of the input.
[in]  m0  First row of the filter.
[in]  m1  Second row of the filter.
[in]  m2  Third row of the filter.
[in]  dilation_x  Dilation, in elements across x.
[in]  input_offset  (Optional) Input quantization offset.

Definition at line 248 of file NEDirectConvolutionDetail.h.

{
    ARM_COMPUTE_UNUSED(input_offset);

    const float32x4x3_t vtop =
    {
        {
            vld1q_f32(in_top),
            vld1q_f32(in_top + dilation_x),
            vld1q_f32(in_top + 2 * dilation_x)
        }
    };
    const float32x4x3_t vmid =
    {
        {
            vld1q_f32(in_mid),
            vld1q_f32(in_mid + dilation_x),
            vld1q_f32(in_mid + 2 * dilation_x)
        }
    };
    const float32x4x3_t vlow =
    {
        {
            vld1q_f32(in_low),
            vld1q_f32(in_low + dilation_x),
            vld1q_f32(in_low + 2 * dilation_x)
        }
    };
    float32x4_t out = vmulq_f32(vtop.val[0], m0.val[0]);
    out             = vmlaq_f32(out, vtop.val[1], m0.val[1]);
    out             = vmlaq_f32(out, vtop.val[2], m0.val[2]);

    out = vmlaq_f32(out, vmid.val[0], m1.val[0]);
    out = vmlaq_f32(out, vmid.val[1], m1.val[1]);
    out = vmlaq_f32(out, vmid.val[2], m1.val[2]);

    out = vmlaq_f32(out, vlow.val[0], m2.val[0]);
    out = vmlaq_f32(out, vlow.val[1], m2.val[1]);
    out = vmlaq_f32(out, vlow.val[2], m2.val[2]);

    return out;
}

References ARM_COMPUTE_UNUSED.

Referenced by convolve_3x3_dilation().
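
A hedged sketch for the dilated float path; with dilation_x = 2 each filter tap reads two elements apart, so every row must provide at least 2 * dilation_x + 4 readable floats (the internal header path is an assumption for the 21.05 tree):

#include <arm_neon.h>
// Internal header; the path is an assumption for the 21.05 tree.
#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h"

using namespace arm_compute::detail;

void run_dilated()
{
    const size_t dilation_x = 2;
    // Each row needs 2 * dilation_x + 4 readable floats for one 4-wide output.
    float top[8] = {}, mid[8] = {}, low[8] = {};
    float weights[9] = {};

    const float32x4x3_t m0 = load_matrix_row(weights + 0);
    const float32x4x3_t m1 = load_matrix_row(weights + 3);
    const float32x4x3_t m2 = load_matrix_row(weights + 6);

    // Four consecutive outputs; input_offset is ignored on the float path.
    const float32x4_t out = single_convolve_3x3_dilation(top, mid, low, m0, m1, m2, dilation_x, 0);
    float results[4];
    vst1q_f32(results, out);
}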

◆ single_convolve_3x3_dilation() [2/2]

int32x4_t arm_compute::detail::single_convolve_3x3_dilation ( const T *  in_top,
const T *  in_mid,
const T *  in_low,
const int32x4x3_t &  m0,
const int32x4x3_t &  m1,
const int32x4x3_t &  m2,
size_t  dilation_x,
int32_t  input_offset 
)
inline

Perform a 3x3 convolution for 4 consecutive 8-bit elements when dilation.x() or dilation.y() is not 1.

Parameters
[in]  in_top  Pointer to the first row of the input.
[in]  in_mid  Pointer to the second row of the input.
[in]  in_low  Pointer to the third row of the input.
[in]  m0  First row of the filter.
[in]  m1  Second row of the filter.
[in]  m2  Third row of the filter.
[in]  dilation_x  Dilation, in elements across x.
[in]  input_offset  Input quantization offset.

Definition at line 466 of file NEDirectConvolutionDetail.h.

{
    using VectorType    = typename std::conditional<std::is_same<T, uint8_t>::value, uint8x8x3_t, int8x8x3_t>::type;
    using OutputTagType = typename wrapper::traits::neon_bitvector_tag_t<int32_t, wrapper::traits::BitWidth::W128>;

    const int32x4_t v_input_offset = wrapper::vdup_n(input_offset, OutputTagType{});

    const VectorType vtop =
    {
        {
            wrapper::vload(in_top),
            wrapper::vload(in_top + dilation_x),
            wrapper::vload(in_top + 2 * dilation_x)
        }
    };
    const VectorType vmid =
    {
        {
            wrapper::vload(in_mid),
            wrapper::vload(in_mid + dilation_x),
            wrapper::vload(in_mid + 2 * dilation_x)
        }
    };
    const VectorType vlow =
    {
        {
            wrapper::vload(in_low),
            wrapper::vload(in_low + dilation_x),
            wrapper::vload(in_low + 2 * dilation_x)
        }
    };

    const int32x4x3_t vtop_s32 =
    {
        {
            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[0])))),
            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[1])))),
            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[2])))),
        }
    };
    const int32x4x3_t vmid_s32 =
    {
        {
            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[0])))),
            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[1])))),
            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[2])))),
        }
    };
    const int32x4x3_t vlow_s32 =
    {
        {
            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[0])))),
            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[1])))),
            wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[2])))),
        }
    };

    int32x4_t out = wrapper::vmul(vtop_s32.val[0], m0.val[0]);
    out           = wrapper::vmla(out, vtop_s32.val[1], m0.val[1]);
    out           = wrapper::vmla(out, vtop_s32.val[2], m0.val[2]);

    out = wrapper::vmla(out, vmid_s32.val[0], m1.val[0]);
    out = wrapper::vmla(out, vmid_s32.val[1], m1.val[1]);
    out = wrapper::vmla(out, vmid_s32.val[2], m1.val[2]);

    out = wrapper::vmla(out, vlow_s32.val[0], m2.val[0]);
    out = wrapper::vmla(out, vlow_s32.val[1], m2.val[1]);
    out = wrapper::vmla(out, vlow_s32.val[2], m2.val[2]);

    return out;
}

References type, arm_compute::wrapper::vaddw(), arm_compute::wrapper::vdup_n(), arm_compute::wrapper::vgetlow(), arm_compute::wrapper::vload(), arm_compute::wrapper::vmla(), arm_compute::wrapper::vmovl(), arm_compute::wrapper::vmul(), and arm_compute::wrapper::vreinterpret().

◆ store_results() [1/2]

void arm_compute::detail::store_results ( int32_t *  buffer,
const int32x4x2_t &  values 
)

Stores an int32x4x2_t array into a memory location.

Parameters
[in]  buffer  Pointer to the memory location where the values will be stored.
[in]  values  Values that will be stored.

◆ store_results() [2/2]

void store_results ( float *  buffer,
const float32x4x2_t &  values 
)

Stores a float32x4x2_t array into a memory location.

Parameters
[in]  buffer  Pointer to the memory location where the values will be stored.
[in]  values  Values that will be stored.

◆ store_results< 1 >() [1/2]

void arm_compute::detail::store_results< 1 > ( int32_t *  buffer,
const int32x4x2_t &  values 
)
inline

Definition at line 122 of file NEDirectConvolutionDetail.h.

{
    vst1q_s32(buffer, values.val[0]);
    vst1q_s32(buffer + 4, values.val[1]);
}

◆ store_results< 1 >() [2/2]

void store_results< 1 > ( float *  buffer,
const float32x4x2_t &  values 
)
inline

Definition at line 130 of file NEDirectConvolution3x3.h.

{
    vst1q_f32(buffer, values.val[0]);
    vst1q_f32(buffer + 4, values.val[1]);
}

Referenced by convolve_3x3().

◆ store_results< 2 >() [1/2]

void arm_compute::detail::store_results< 2 > ( int32_t *  buffer,
const int32x4x2_t &  values 
)
inline

Definition at line 129 of file NEDirectConvolutionDetail.h.

{
    vst1q_s32(buffer, values.val[0]);
}

◆ store_results< 2 >() [2/2]

void store_results< 2 > ( float *  buffer,
const float32x4x2_t &  values 
)
inline

Definition at line 137 of file NEDirectConvolution3x3.h.

{
    vst1q_f32(buffer, values.val[0]);
}

Referenced by convolve_3x3().

◆ store_results< 3 >() [1/2]

void arm_compute::detail::store_results< 3 > ( int32_t *  buffer,
const int32x4x2_t &  values 
)
inline

Definition at line 135 of file NEDirectConvolutionDetail.h.

{
    vst1_s32(buffer, vget_low_s32(values.val[0]));
}

◆ store_results< 3 >() [2/2]

void store_results< 3 > ( float *  buffer,
const float32x4x2_t &  values 
)
inline

Definition at line 143 of file NEDirectConvolution3x3.h.

{
    vst1_f32(buffer, vget_low_f32(values.val[0]));
}

Referenced by convolve_3x3().

◆ validate_internal_context()

StatusCode arm_compute::detail::validate_internal_context ( const IContext *  ctx )
inline

Check if an internal context is valid.

Parameters
[in]  ctx  Internal context to check
Returns
A status code

Definition at line 143 of file IContext.h.

{
    if(ctx == nullptr || !ctx->is_valid())
    {
        ARM_COMPUTE_LOG_ERROR_ACL("Invalid context object");
        return StatusCode::InvalidArgument;
    }
    return StatusCode::Success;
}

References ARM_COMPUTE_LOG_ERROR_ACL, arm_compute::InvalidArgument, IContext::is_valid(), and arm_compute::Success.

Referenced by AclCreateQueue(), AclCreateTensor(), AclCreateTensorPack(), AclDestroyContext(), AclGetClContext(), AclGetClDevice(), and AclSetClContext().
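
A hedged sketch of the guard pattern all four validate_internal_* helpers share: convert the opaque C handle to the internal type, validate it, and bail out early on failure. The create_something entry point is illustrative, and get_internal is named here as an assumption about the handle-cast helper, not copied from the functions listed above:

arm_compute::StatusCode create_something(AclContext external_ctx)
{
    arm_compute::IContext *ctx = get_internal(external_ctx); // assumed handle cast

    const arm_compute::StatusCode status = arm_compute::detail::validate_internal_context(ctx);
    if(status != arm_compute::StatusCode::Success)
    {
        return status; // reject null or invalidated handles before touching them
    }
    // ... ctx is safe to dereference from here ...
    return arm_compute::StatusCode::Success;
}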

◆ validate_internal_pack()

StatusCode arm_compute::detail::validate_internal_pack ( const TensorPack *  pack )
inline

Check if an internal TensorPack is valid.

Parameters
[in]  pack  Internal tensor pack to check
Returns
A status code

Definition at line 119 of file TensorPack.h.

{
    if(pack == nullptr || !pack->is_valid())
    {
        ARM_COMPUTE_LOG_ERROR_ACL("[TensorPack]: Invalid tensor pack object");
        return StatusCode::InvalidArgument;
    }
    return StatusCode::Success;
}

References ARM_COMPUTE_LOG_ERROR_ACL, arm_compute::InvalidArgument, TensorPack::is_valid(), and arm_compute::Success.

Referenced by AclDestroyTensorPack(), AclPackTensor(), and AclPackTensors().

◆ validate_internal_queue()

StatusCode arm_compute::detail::validate_internal_queue ( const IQueue *  queue )
inline

Check if an internal queue is valid.

Parameters
[in]  queue  Internal queue to check
Returns
A status code

Definition at line 89 of file IQueue.h.

{
    if(queue == nullptr || !queue->is_valid())
    {
        ARM_COMPUTE_LOG_ERROR_ACL("[IQueue]: Invalid queue object");
        return StatusCode::InvalidArgument;
    }
    return StatusCode::Success;
}

References ARM_COMPUTE_LOG_ERROR_ACL, arm_compute::InvalidArgument, IQueue::is_valid(), and arm_compute::Success.

Referenced by AclDestroyQueue(), AclGetClQueue(), AclQueueFinish(), and AclSetClQueue().

◆ validate_internal_tensor()

StatusCode arm_compute::detail::validate_internal_tensor ( const ITensorV2 *  tensor )
inline

Check if an internal tensor is valid.

Parameters
[in]  tensor  Internal tensor to check
Returns
A status code

Definition at line 129 of file ITensorV2.h.

{
    if(tensor == nullptr || !tensor->is_valid())
    {
        ARM_COMPUTE_LOG_ERROR_ACL("[ITensorV2]: Invalid tensor object");
        return StatusCode::InvalidArgument;
    }
    return StatusCode::Success;
}

References ARM_COMPUTE_LOG_ERROR_ACL, arm_compute::InvalidArgument, ITensorV2::is_valid(), and arm_compute::Success.

Referenced by AclDestroyTensor(), AclGetClMem(), AclMapTensor(), AclTensorImport(), and AclUnmapTensor().