Compute Library
 22.05
arm_gemm Namespace Reference

Namespaces

 utils
 

Data Structures

struct  Activation
 
class  barrier
 
struct  ConvolutionParameters
 
class  convolver
 
struct  GemmArgs
 
class  GemmCommon
 
struct  GemmConfig
 
class  GemmHybrid
 
class  GemmHybridIndirect
 
class  GemmHybridQuantized
 
class  GemmHybridQuantizedInline
 
struct  GemmImplementation
 
struct  GemmImplementation< Top, Tret, Nothing >
 
class  GemmInterleaved
 
class  GemmInterleavedPretransposed2d
 
class  GemvBatched
 
class  GemvPretransposed
 
class  IGemmCommon
 
struct  IndirectInputArg
 
struct  IndirectOutputArg
 
struct  KernelDescription
 
class  NDCoordinate
 NDCoordinate builds upon a range, but specifies a starting position in addition to a size which it inherits from NDRange. More...
 
class  NDRange
 
struct  Nothing
 
struct  PerformanceParameters
 
class  QuantizeWrapper
 
struct  Requantize32
 
class  StdTransformsFixed
 
class  StdTransformsSVE
 

Typedefs

using bfloat16 = arm_compute::bfloat16
 
template<typename strategy , typename To , typename Tr , typename OutputStage = Nothing>
using GemmInterleavedNoMerge = GemmInterleaved< strategy, To, Tr, OutputStage, false >
 
template<typename strategy , typename To , typename Tr >
using GemmInterleavedPretransposedNoMergeQuantizedInline = GemmInterleaved< strategy, To, Tr, Requantize32, false >
 
template<typename strategy , typename To , typename Tr >
using GemmInterleavedQuantized = GemmInterleaved< strategy, To, Tr, Requantize32 >
 
template<typename Top , typename Tret >
using UniqueGemmCommon = std::unique_ptr< GemmCommon< Top, Tret > >
 
using ndrange_t = NDRange< ndrange_max >
 
using ndcoord_t = NDCoordinate< ndrange_max >
 

Enumerations

enum  VLType { None, SVE }
 
enum  GemmMethod {
  DEFAULT, GEMV_BATCHED, GEMV_PRETRANSPOSED, GEMV_NATIVE_TRANSPOSED,
  GEMM_NATIVE, GEMM_HYBRID, GEMM_INTERLEAVED, GEMM_INTERLEAVED_2D,
  QUANTIZE_WRAPPER, QUANTIZE_WRAPPER_2D, GEMM_HYBRID_QUANTIZED
}
 

Functions

template<typename T >
void bias_adder (T *out, unsigned int stride, const T *bias, unsigned int rows, unsigned int cols)
 
template<bool DoBias, typename T >
void activator (T *out, unsigned int stride, const T *bias, Activation act, unsigned int rows, unsigned int cols)
 
template<>
const GemmImplementation< bfloat16, float > * gemm_implementation_list< bfloat16, float > ()
 
template UniqueGemmCommon< bfloat16, float > gemm< bfloat16, float, Nothing > (const GemmArgs &args, const Nothing &)
 
template bool has_opt_gemm< bfloat16, float, Nothing > (const GemmArgs &args, const Nothing &)
 
template std::vector< KernelDescription > get_compatible_kernels< bfloat16, float, Nothing > (const GemmArgs &args, const Nothing &)
 
template<>
const GemmImplementation< float, float > * gemm_implementation_list< float, float > ()
 
template UniqueGemmCommon< float, float > gemm< float, float, Nothing > (const GemmArgs &args, const Nothing &)
 
template bool has_opt_gemm< float, float, Nothing > (const GemmArgs &args, const Nothing &)
 
template std::vector< KernelDescription > get_compatible_kernels< float, float, Nothing > (const GemmArgs &args, const Nothing &)
 
template<typename Top , typename Tret , class OutputStage = Nothing>
const GemmImplementation< Top, Tret, OutputStage > * gemm_implementation_list ()
 
template<typename Top , typename Tret , class OutputStage >
bool find_implementation (const GemmArgs &args, const OutputStage &os, const GemmImplementation< Top, Tret, OutputStage > *&impl)
 
template<typename Top , typename Tret , class OutputStage >
std::vector< KernelDescription > get_compatible_kernels (const GemmArgs &args, const OutputStage &os)
 
template<typename Top , typename Tret , class OutputStage >
bool has_opt_gemm (const GemmArgs &args, const OutputStage &os)
 
template<typename Top , typename Tret , class OutputStage >
UniqueGemmCommon< Top, Tret > gemm (const GemmArgs &args, const OutputStage &os)
 
template<unsigned int height_vectors, unsigned int block, VLType vlt, bool integrate_sums, typename TIn , typename TOut >
void interleave_block (TOut *&out, const TIn *const *in, size_t width, size_t height, size_t row_offset, bool first)
 
template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TOut >
void FixupRowSums (TOut *&out, const int32_t row_sum_multiplier)
 
template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TIn , typename TOut >
void IndirectInterleave (TOut *out, const TIn *const *const *ptr, unsigned int stringlen, unsigned int rounded_stringlen, const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool integrate_sums, const int32_t row_sum_multiplier)
 
template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TIn , typename TOut >
void ConvolutionInterleave (TOut *out, const TIn *in, size_t in_stride, const convolver< TIn > &conv, const unsigned int rounded_stringlen, const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool integrate_sums, const int32_t row_sum_multiplier)
 
template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TIn , typename TOut >
void Interleave (TOut *out, const TIn *in, size_t in_stride, const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool integrate_sums, const int32_t row_sum_multiplier)
 
template<unsigned int twidth, unsigned int height, bool sve = false, typename Tin , typename Tout >
void MergeResults (Tout *out, const Tin *in, int ldc, int y0, int ymax, int x0, int xmax, const Tout *bias, Activation act, bool append)
 
template<typename Tin , typename Tout >
void requantize_block_32 (const Requantize32 &qp, unsigned int width, unsigned int height, const Tin *input, unsigned int in_stride, Tout *output, unsigned int out_stride, const int32_t *row_bias, const int32_t *col_bias, unsigned int start_col)
 
template<typename T >
void compute_row_sums (const Requantize32 &qp, unsigned int width, unsigned int height, const T *input, unsigned int in_stride, int32_t *row_bias)
 
template<typename T >
void compute_col_sums (const Requantize32 &qp, unsigned int width, unsigned int height, const T *input, unsigned int in_stride, int32_t *col_bias, unsigned int depth, unsigned int multi, unsigned int first_col)
 
template<typename T >
void row_sums_indirect (unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg< T > A_arg, size_t M, int32_t *output_ptr, const Requantize32 *qp)
 
template<unsigned int IntBy, unsigned int BlockBy, bool Transposed, VLType vlt = VLType::None, typename TOut , typename TIn >
void Transform (TOut *out, const TIn *const in, const int stride, const int k0, const int kmax, const int x0, const int xmax)
 
template<typename T >
std::string get_type_name ()
 
template<typename T >
T iceildiv (const T a, const T b)
 
template<typename T >
T roundup (const T a, const T b)
 
bool quant_no_left_shift (const Requantize32 &qp)
 
bool quant_hybrid_symmetric (const Requantize32 &qp)
 
bool quant_hybrid_asymmetric (const Requantize32 &qp)
 
template<typename Top , typename Tret , class OutputStage = Nothing>
KernelDescription get_gemm_method (const GemmArgs &args, const OutputStage &={})
 
arm_compute::Window to_window (const ndrange_t &ndr)
 
arm_compute::Window to_window (const ndcoord_t &ndc)
 
ndrange_t to_ndrange (const arm_compute::Window &win)
 Convert an arm_compute::Window to an arm_gemm::NDRange of the same max dimensions. More...
 
ndcoord_t to_ndcoord (const arm_compute::Window &win)
 Convert an arm_compute::Window to an arm_gemm::NDCoord of the same max dimensions. More...
 

Variables

std::mutex report_mutex
 
constexpr std::size_t ndrange_max
 

Typedef Documentation

◆ bfloat16

using bfloat16 = arm_compute::bfloat16

Definition at line 30 of file bfloat.hpp.

◆ GemmInterleavedNoMerge

using GemmInterleavedNoMerge = GemmInterleaved<strategy, To, Tr, OutputStage, false>

Definition at line 1075 of file gemm_interleaved.hpp.

◆ GemmInterleavedPretransposedNoMergeQuantizedInline

using GemmInterleavedPretransposedNoMergeQuantizedInline = GemmInterleaved<strategy, To, Tr, Requantize32, false>

◆ GemmInterleavedQuantized

using GemmInterleavedQuantized = GemmInterleaved<strategy, To, Tr, Requantize32>

◆ ndcoord_t

typedef NDCoordinate< 6 > ndcoord_t

Definition at line 45 of file arm_gemm_compute_iface.hpp.

◆ ndrange_t

typedef NDRange< 6 > ndrange_t

Definition at line 44 of file arm_gemm_compute_iface.hpp.

◆ UniqueGemmCommon

using UniqueGemmCommon = std::unique_ptr<GemmCommon<Top, Tret> >

Definition at line 174 of file arm_gemm.hpp.

Enumeration Type Documentation

◆ GemmMethod

enum GemmMethod
strong
Enumerator
DEFAULT 
GEMV_BATCHED 
GEMV_PRETRANSPOSED 
GEMV_NATIVE_TRANSPOSED 
GEMM_NATIVE 
GEMM_HYBRID 
GEMM_INTERLEAVED 
GEMM_INTERLEAVED_2D 
QUANTIZE_WRAPPER 
QUANTIZE_WRAPPER_2D 
GEMM_HYBRID_QUANTIZED 

◆ VLType

enum VLType
strong
Enumerator
None 
SVE 

Definition at line 80 of file utils.hpp.

Function Documentation

◆ activator()

void arm_gemm::activator ( T *  out,
unsigned int  stride,
const T *  bias,
Activation  act,
unsigned int  rows,
unsigned int  cols 
)
inline

Definition at line 40 of file bias_adder.hpp.

References bias_adder(), Activation::BoundedReLU, Activation::None, Activation::param1, Activation::ReLU, and Activation::type.

40  {
41  if (act.type == Activation::Type::None) {
42  if (DoBias) {
43  bias_adder(out, stride, bias, rows, cols);
44  }
45  return;
46  }
47 
48  if (act.type == Activation::Type::ReLU) {
49  for (unsigned int row=0; row<rows; row++) {
50  for (unsigned int col=0; col<cols; col++) {
51  T &v = out[row * stride + col];
52  if (DoBias) {
53  v += bias[col];
54  }
55  v = std::max(static_cast<T>(0), v);
56  }
57  }
58  }
59 
60  if (act.type == Activation::Type::BoundedReLU) {
61  const T max = static_cast<T>(act.param1);
62 
63  for (unsigned int row=0; row<rows; row++) {
64  for (unsigned int col=0; col<cols; col++) {
65  T &v = out[row * stride + col];
66  if (DoBias) {
67  v += bias[col];
68  }
69  v = std::max(static_cast<T>(0), std::min(v, max));
70  }
71  }
72  }
73 }
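A hedged usage sketch (not taken from the library's own code): add a per-column bias and apply ReLU to a small row-major block, using only the Activation members visible in the code above. The include path and the helper name are illustrative assumptions.

#include <array>
#include "bias_adder.hpp" // assumed include path

void activator_example()
{
    std::array<float, 6> out{{-1.0f, 2.0f, -3.0f, 4.0f, -5.0f, 6.0f}};
    const std::array<float, 3> bias{{0.5f, 0.5f, 0.5f}};

    arm_gemm::Activation act;
    act.type = arm_gemm::Activation::Type::ReLU;

    // DoBias=true: bias[col] is added to each element before the activation is applied.
    arm_gemm::activator<true>(out.data(), /*stride=*/3, bias.data(), act, /*rows=*/2, /*cols=*/3);
}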

◆ bias_adder()

void arm_gemm::bias_adder ( T *  out,
unsigned int  stride,
const T *  bias,
unsigned int  rows,
unsigned int  cols 
)
inline

Definition at line 31 of file bias_adder.hpp.


Referenced by activator(), and GemmHybrid< strategy, To, Tr >::execute().

31  {
32  for (unsigned int row=0; row<rows; row++) {
33  for (unsigned int col=0; col<cols; col++) {
34  out[row * stride + col] += bias[col];
35  }
36  }
37 }

◆ compute_col_sums()

void arm_gemm::compute_col_sums ( const Requantize32 qp,
unsigned int  width,
unsigned int  height,
const T *  input,
unsigned int  in_stride,
int32_t *  col_bias,
unsigned int  depth,
unsigned int  multi,
unsigned int  first_col 
)

◆ compute_row_sums()

void arm_gemm::compute_row_sums ( const Requantize32 qp,
unsigned int  width,
unsigned int  height,
const T *  input,
unsigned int  in_stride,
int32_t *  row_bias 
)

◆ ConvolutionInterleave()

void ConvolutionInterleave ( TOut *  out,
const TIn *  in,
size_t  in_stride,
const convolver< TIn > &  conv,
const unsigned int  rounded_stringlen,
const unsigned int  y0,
const unsigned int  ymax,
const unsigned int  k0,
const unsigned int  kmax,
bool  integrate_sums,
const int32_t  row_sum_multiplier 
)

Definition at line 226 of file interleave_indirect.cpp.

References convolver< T >::process_columns(), and SVE.

227  {
228  const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);
229 
230  auto conv_cols = conv.process_columns(in, in_stride, k0, kmax, rounded_stringlen);
231 
232  // Use alloca here as a std::vector can be expensive in highly threaded scenarios.
233  const TIn **row_ptrs = reinterpret_cast<const TIn **>(alloca(height * sizeof(const TIn *)));
234 
235  for (unsigned int ybase = y0; ybase < ymax; ybase += height) {
236  // How many of the rows are active - the rest will get padded in interleave_block.
237  unsigned int active_height = std::min(ymax - ybase, height);
238  bool first = true;
239 
240  auto conv_rows = conv_cols.process_rows(ybase, active_height);
241 
242  while (!conv_rows.finished()) {
243  unsigned int width, offset;
244 
245  // Get next set of parameters
246  std::tie(width, offset) = conv_rows.next_block(row_ptrs);
247 
248  // Perform the interleave
249  if (std::is_integral<TOut>::value && integrate_sums && row_sum_multiplier) {
250  interleave_block<height_vectors, block, vlt, true>(out, row_ptrs, width, active_height, offset, first);
251  } else {
252  interleave_block<height_vectors, block, vlt, false>(out, row_ptrs, width, active_height, offset, first);
253  }
254 
255  first=false;
256  }
257 
258  if (std::is_integral<TOut>::value && integrate_sums) {
259  FixupRowSums<height_vectors, block, vlt>(out, row_sum_multiplier);
260  }
261  }
262 }

◆ find_implementation()

bool arm_gemm::find_implementation ( const GemmArgs args,
const OutputStage &  os,
const GemmImplementation< Top, Tret, OutputStage > *&  impl 
)

Definition at line 166 of file gemm_implementation.hpp.

References GemmArgs::_cfg, DEFAULT, GemmImplementation< Top, Tret, OutputStage >::do_cycle_estimate(), and GemmImplementation< Top, Tret, OutputStage >::method.

Referenced by get_compatible_kernels().

166  {
167  auto gemms = gemm_implementation_list<Top, Tret, OutputStage>();
168  const GemmConfig *cfg = args._cfg;
169 
170  const GemmImplementation<Top, Tret, OutputStage> *saved_impl = nullptr;
171  uint64_t best_estimate = 0;
172 
173  for (const GemmImplementation<Top, Tret, OutputStage> *i = gemms; i->method != GemmMethod::DEFAULT; i++) {
174  /* Skip if this implementation doesn't support these args. */
175  if (!i->do_is_supported(args, os)) {
176  continue;
177  }
178 
179  /* Skip if a specific method is requested and this is a different one. */
180  if (cfg && cfg->method != GemmMethod::DEFAULT && i->method != cfg->method) {
181  continue;
182  }
183 
184  /* Skip if a filter is to be applied and it doesn't match. */
185  if (cfg && cfg->filter != "" && !strstr(i->name, cfg->filter.c_str())) {
186  continue;
187  }
188 
189  /* Test the cycle estimate */
190  uint64_t estimate = i->do_cycle_estimate(args, os);
191 
192  /* Short circuit - if the estimate is zero, return this one immediately. */
193  if (estimate==0) {
194  impl=i;
195  return true;
196  }
197 
198  /* Otherwise, remember this is our best so far if we don't yet have
199  * a valid candidate, or we beat the estimate. */
200  if ((saved_impl == nullptr) || (estimate < best_estimate)) {
201  saved_impl = i;
202  best_estimate = estimate;
203  }
204  }
205 
206  /* Return whichever method gave the best estimate. */
207  if (saved_impl != nullptr) {
208  impl = saved_impl;
209  return true;
210  }
211 
212  return false;
213 }
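The cfg->method and cfg->filter checks above read the GemmConfig pointed to by GemmArgs::_cfg. A minimal sketch of how such a config might be filled in, assuming GemmConfig is default-constructible; the filter string is an example value only:

arm_gemm::GemmConfig make_interleaved_only_config()
{
    arm_gemm::GemmConfig cfg;
    cfg.method = arm_gemm::GemmMethod::GEMM_INTERLEAVED; // restrict selection to this method
    cfg.filter = "smallK";                               // only kernels whose name contains this substring (example value)
    return cfg;                                          // point GemmArgs::_cfg at this object to constrain the search above
}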

◆ FixupRowSums()

void arm_gemm::FixupRowSums ( TOut *&  out,
const int32_t  row_sum_multiplier 
)
inline

Definition at line 125 of file interleave_indirect.cpp.

References SVE.

125  {
126  const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);
127 
128  // If we are integrating row sums, we need to do some fix up, depending on whether the multiplier is non-zero or not.
129  if (row_sum_multiplier) {
130  // Non-zero: interleave_block<>() will have done the sums, so 'out' will point to the start of the
131  // next block (post sums).
132  // We need to go back and apply the multiplier to the computed sums. We don't need to change 'out'.
133  int32_t *out_int32 = reinterpret_cast<int32_t *>(out);
134 
135  out_int32 -= height;
136  for (unsigned int i=0; i<height; i++) {
137  out_int32[i] *= row_sum_multiplier;
138  }
139  } else {
140  // Zero: interleave_block<>() will *not* have done the sums, so 'out' will point to the start of the
141  // sum block. We need to insert the (zero) sums, and advance 'out'.
142  int32_t *out_int32 = reinterpret_cast<int32_t *>(out);
143 
144  for (unsigned int i=0; i<height; i++) {
145  out_int32[i] = 0;
146  }
147 
148  out_int32 += height;
149 
150  out = reinterpret_cast<TOut *>(out_int32);
151  }
152 }

◆ gemm()

UniqueGemmCommon< Top, Tret > gemm ( const GemmArgs args,
const OutputStage &  os 
)

Definition at line 245 of file gemm_implementation.hpp.

References GemmImplementation< Top, Tret, OutputStage >::do_instantiate().

Referenced by arm_compute::test::validation::DATA_TEST_CASE(), and arm_compute::test::validation::TEST_CASE().

245  {
246  const GemmImplementation<Top, Tret, OutputStage> *impl;
247 
248  if (find_implementation<Top, Tret, OutputStage>(args, os, impl)) {
249  return UniqueGemmCommon<Top, Tret>(impl->do_instantiate(args, os));
250  }
251 
252  return UniqueGemmCommon<Top, Tret>(nullptr);
253 }
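A hedged usage sketch built only from the functions documented on this page; constructing a GemmArgs is not covered here, so it is taken as a parameter, and the include path is assumed.

#include "arm_gemm.hpp" // assumed include path

arm_gemm::UniqueGemmCommon<float, float> select_fp32_gemm(const arm_gemm::GemmArgs &args)
{
    // has_opt_gemm() simply asks find_implementation() whether any kernel matches.
    if (!arm_gemm::has_opt_gemm<float, float, arm_gemm::Nothing>(args, {})) {
        return nullptr; // gemm() below would also return an empty pointer in this case
    }

    // gemm() instantiates the implementation with the best cycle estimate.
    return arm_gemm::gemm<float, float, arm_gemm::Nothing>(args, {});
}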

◆ gemm< bfloat16, float, Nothing >()

template UniqueGemmCommon<bfloat16, float> arm_gemm::gemm< bfloat16, float, Nothing > ( const GemmArgs args,
const Nothing  
)

◆ gemm< float, float, Nothing >()

template UniqueGemmCommon<float, float> arm_gemm::gemm< float, float, Nothing > ( const GemmArgs args,
const Nothing  
)

◆ gemm_implementation_list()

const GemmImplementation<Top, Tret, OutputStage>* arm_gemm::gemm_implementation_list ( )

◆ gemm_implementation_list< bfloat16, float >()

◆ gemm_implementation_list< float, float >()

const GemmImplementation<float, float>* arm_gemm::gemm_implementation_list< float, float > ( )

Definition at line 229 of file gemm_fp32.cpp.

References gemm< float, float, Nothing >(), get_compatible_kernels< float, float, Nothing >(), and has_opt_gemm< float, float, Nothing >().

229  {
230  return gemm_fp32_methods;
231 }

◆ get_compatible_kernels()

std::vector< KernelDescription > get_compatible_kernels ( const GemmArgs args,
const OutputStage &  os 
)

Definition at line 216 of file gemm_implementation.hpp.

References DEFAULT, GemmImplementation< Top, Tret, OutputStage >::do_cycle_estimate(), find_implementation(), and GemmImplementation< Top, Tret, OutputStage >::method.

216  {
217  std::vector<KernelDescription> res;
218 
219  /* Find out what the default implementation in so we can set the flag accordingly later. */
220  const GemmImplementation<Top, Tret, OutputStage> *default_impl;
221  find_implementation(args, os, default_impl);
222 
223  auto gemms = gemm_implementation_list<Top, Tret, OutputStage>();
224 
225  for (const GemmImplementation<Top, Tret, OutputStage> *i = gemms; i->method != GemmMethod::DEFAULT; i++) {
226  /* Check that this implementation supports the presented problem. */
227 
228  if (!i->do_is_supported(args, os)) {
229  continue;
230  }
231 
232  res.push_back(KernelDescription(i->method, i->name, i==default_impl, i->do_cycle_estimate(args, os)));
233  }
234 
235  return res;
236 }
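A hedged usage sketch: enumerate every kernel that could run a given problem and print its cycle estimate. The KernelDescription member names ('name', 'is_default', 'cycle_estimate') are assumed from the constructor call in the code above; GemmArgs construction is not covered here.

#include <cstdio>

void list_fp32_kernels(const arm_gemm::GemmArgs &args)
{
    for (const auto &k : arm_gemm::get_compatible_kernels<float, float, arm_gemm::Nothing>(args, {})) {
        std::printf("%-40s cycles=%llu%s\n",
                    k.name.c_str(),
                    static_cast<unsigned long long>(k.cycle_estimate),
                    k.is_default ? " (default)" : "");
    }
}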

◆ get_compatible_kernels< bfloat16, float, Nothing >()

template std::vector<KernelDescription> arm_gemm::get_compatible_kernels< bfloat16, float, Nothing > ( const GemmArgs args,
const Nothing  
)

◆ get_compatible_kernels< float, float, Nothing >()

template std::vector<KernelDescription> arm_gemm::get_compatible_kernels< float, float, Nothing > ( const GemmArgs args,
const Nothing  
)

◆ get_gemm_method()

KernelDescription arm_gemm::get_gemm_method ( const GemmArgs args,
const OutputStage &  = {} 
)

◆ get_type_name()

std::string arm_gemm::get_type_name ( )

Definition at line 42 of file utils.hpp.

42  {
43 #ifdef __GNUC__
44  std::string s = __PRETTY_FUNCTION__;
45 
46  auto start = s.find("cls_");
47 
48  if (start==std::string::npos) {
49  return "(unknown)";
50  }
51 
52  for(size_t x = start+4; x<s.size(); x++) {
53  if (s[x] == ';' || s[x] == ']') {
54  return s.substr(start+4, x-(start+4));
55  }
56  }
57 
58  return "(unknown)";
59 #else
60  return "(unsupported)";
61 #endif
62 }

◆ has_opt_gemm()

bool has_opt_gemm ( const GemmArgs args,
const OutputStage &  os 
)

Definition at line 239 of file gemm_implementation.hpp.


239  {
240  const GemmImplementation<Top, Tret, OutputStage> *impl;
241  return find_implementation<Top, Tret, OutputStage>(args, os, impl);
242 }

◆ has_opt_gemm< bfloat16, float, Nothing >()

template bool arm_gemm::has_opt_gemm< bfloat16, float, Nothing > ( const GemmArgs args,
const Nothing  
)

◆ has_opt_gemm< float, float, Nothing >()

template bool arm_gemm::has_opt_gemm< float, float, Nothing > ( const GemmArgs args,
const Nothing  
)

◆ iceildiv()

T arm_gemm::iceildiv ( const T  a,
const T  b 
)
inline
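The body of iceildiv() is not reproduced on this page. A minimal sketch of the conventional behaviour (integer ceiling division for positive a and b), named differently to make clear it is illustrative:

template<typename T>
inline T iceildiv_sketch(const T a, const T b)
{
    return (a + b - 1) / b; // e.g. iceildiv_sketch(7, 3) == 3
}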

◆ IndirectInterleave()

void IndirectInterleave ( TOut *  out,
const TIn *const *const *  ptr,
unsigned int  stringlen,
unsigned int  rounded_stringlen,
const unsigned int  y0,
const unsigned int  ymax,
const unsigned int  k0,
const unsigned int  kmax,
bool  integrate_sums,
const int32_t  row_sum_multiplier 
)

Definition at line 155 of file interleave_indirect.cpp.

References SVE.

158  {
159  const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);
160 
161  // 'interleave_block' implementations are entitled to read a pointer for each row they handle from the input
162  // pointer array, even for out of range rows (although they must not subsequently dereference those pointers for
163  // out of range rows). This allows interleave_block to use techniques like row predication, or loading all
164  // pointers and conditionally overriding the out of range ones.
165 
166  // This is problematic in the "pure" indirect case when we get to the last rows, where it can lead to out of
167  // range reads. Avoid this with a local buffer to use in last-rows cases. Use alloca as a std::vector can be
168  // expensive in highly threaded scenarios.
169  const TIn **row_ptrs = reinterpret_cast<const TIn **>(alloca(height * sizeof(const TIn *)));
170 
171  // Figure out the starting position based on k0 (with rounded length)
172  unsigned int start_string = k0 / rounded_stringlen;
173  unsigned int start_stringpos = k0 % rounded_stringlen;
174 
175  // Process blocks of 'height' height...
176  for (unsigned int ybase = y0; ybase < ymax; ybase+=height) {
177  // Height to process
178  unsigned int active_height = std::min(ymax - ybase, height);
179 
180  // Track our progress through the various strings
181  unsigned int k_left = (kmax - k0);
182  unsigned int string = start_string;
183  unsigned int stringpos = start_stringpos;
184 
185  bool first = true;
186 
187  // Prepare to call 'interleave_block' above for each string encompassed by K range
188  while (k_left > 0) {
189  // Width to process - and the width we will generate (with padding)
190  unsigned int in_width = std::min(k_left, stringlen - stringpos);
191  unsigned int out_width = std::min(k_left, rounded_stringlen - stringpos);
192 
193  const TIn * const *row_base = ptr[string] + ybase;
194 
195  // If not all rows are valid, copy the ones that are into local array (see above comment).
196  if (active_height < height) {
197  for (unsigned int i=0; i<active_height; i++) {
198  row_ptrs[i] = ptr[string][ybase + i];
199  }
200 
201  row_base = row_ptrs;
202  }
203 
204  // 'integrate_sums' is a function parameter rather than a template parameter to prevent duplicating too
205  // much code. However, integrated sums make no sense for non-integral types and won't ever be
206  // requested. So put a type trait check here to avoid generating pointless code.
207  if (std::is_integral<TOut>::value && integrate_sums && row_sum_multiplier) {
208  interleave_block<height_vectors, block, vlt, true>(out, row_base, in_width, active_height, stringpos, first);
209  } else {
210  interleave_block<height_vectors, block, vlt, false>(out, row_base, in_width, active_height, stringpos, first);
211  }
212 
213  k_left -= out_width;
214  string++;
215  stringpos=0;
216  first=false;
217  }
218 
219  if (std::is_integral<TOut>::value && integrate_sums) {
220  FixupRowSums<height_vectors, block, vlt>(out, row_sum_multiplier);
221  }
222  }
223 }

◆ Interleave()

void Interleave ( TOut *  out,
const TIn *  in,
size_t  in_stride,
const unsigned int  y0,
const unsigned int  ymax,
const unsigned int  k0,
const unsigned int  kmax,
bool  integrate_sums,
const int32_t  row_sum_multiplier 
)

Definition at line 265 of file interleave_indirect.cpp.

References SVE.

265  {
266  const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);
267 
268  // Use alloca here as a std::vector can be expensive in highly threaded scenarios.
269  const TIn **row_ptrs = reinterpret_cast<const TIn **>(alloca(height * sizeof(const TIn *)));
270 
271  const unsigned int width=kmax-k0;
272 
273  for (unsigned int y=y0; y<ymax; y+=height) {
274  for (unsigned int r=0; r<height; r++) {
275  row_ptrs[r] = in + ((y + r) * in_stride);
276  }
277 
278  if (std::is_integral<TOut>::value && integrate_sums && row_sum_multiplier) {
279  interleave_block<height_vectors, block, vlt, true>(out, row_ptrs, width, std::min(height, ymax-y), k0, true);
280  } else {
281  interleave_block<height_vectors, block, vlt, false>(out, row_ptrs, width, std::min(height, ymax-y), k0, true);
282  }
283 
284  if (std::is_integral<TOut>::value && integrate_sums) {
285  FixupRowSums<height_vectors, block, vlt>(out, row_sum_multiplier);
286  }
287  }
288 }

◆ interleave_block()

void arm_gemm::interleave_block ( TOut *&  out,
const TIn *const *  in,
size_t  width,
size_t  height,
size_t  row_offset,
bool  first 
)

Definition at line 61 of file interleave_indirect.cpp.

References SVE.

61  {
62  const unsigned int int_by = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);
63 
64  std::vector<int32_t> the_sums;
65 
66  if (integrate_sums) {
67  the_sums = std::vector<int32_t>(int_by, 0);
68 
69  if (!first) {
70  // In 'integrate sums' mode, we dump the sums at the end on each pass.
71 
72  // On the last pass this is correct, but on other passes it is not -
73  // so on the subsequent pass we need to take the output written by
74  // the previous pass as starting point for the sums, and then
75  // overwrite them with new interleaved data.
76  int32_t *out_int32 = reinterpret_cast<int32_t *>(out);
77 
78  // Rewind pointer to where we wrote out the sums last time.
79  out_int32 -= int_by;
80 
81  // Restore the running sums.
82  memcpy(the_sums.data(), out_int32, int_by * sizeof(int32_t));
83 
84  // Update the "real" pointer so that the next output will clobber the old sums.
85  out = reinterpret_cast<TOut *>(out_int32);
86  }
87  }
88 
89  for (unsigned int pos=0; pos<width; pos+=block) {
90  for (unsigned int row=0; row<int_by; row++) {
91  // Row out of range - pad 'block' entries.
92  if (row >= height) {
93  for (unsigned int col=0; col<block; col++) {
94  *out++ = 0;
95  }
96  continue;
97  }
98 
99  for (unsigned int col=0; col<block; col++) {
100  // Column out of range - pad a single entry
101  if (pos + col >= width) {
102  *out++ = 0;
103  continue;
104  }
105 
106  if (integrate_sums) {
107  the_sums[row] += in[row][row_offset + pos + col];
108  }
109 
110  *out++ = in[row][row_offset + pos + col];
111  }
112  }
113  }
114 
115  if (integrate_sums) {
116  int32_t *out_int32 = reinterpret_cast<int32_t *>(out);
117 
118  memcpy(out_int32, the_sums.data(), int_by * sizeof(int32_t));
119 
120  out = reinterpret_cast<TOut *>(out_int32 + int_by);
121  }
122 }

◆ MergeResults()

void MergeResults ( Tout *  out,
const Tin *  in,
int  ldc,
int  y0,
int  ymax,
int  x0,
int  xmax,
const Tout *  bias,
Activation  act,
bool  append 
)

Definition at line 38 of file mergeresults.cpp.

References Activation::BoundedReLU, Activation::None, Activation::param1, Activation::ReLU, and Activation::type.

Referenced by StdTransformsSVE< TOperand, TResult, height, width_vectors, block, mmla, integrate_sums >::Merge().

38  {
39  // NOTE: The following code is disabled to avoid calling get_vector_length(), so templated MergeResults will not
40  // be correct for SVE cases. This is OK as we have specialisations for all needed SVE cases anyway.
41  //
42  // For SVE cases, multiply the width up by the vector length.
43  // Use the *input* type to determine this, since this will be what the kernel operated on.
44  // const int width = twidth * (sve ? get_vector_length<Tin>() : 1);
45  const int width = twidth;
46 
47  const int full_y_blocks = (ymax - y0) / height;
48  const int y_remainder = (ymax - y0) % height;
49  const int y_blocks = full_y_blocks + (y_remainder ? 1 : 0);
50 
51  const int full_x_blocks = (xmax - x0) / width;
52  const int x_remainder = (xmax - x0) % width;
53  const int x_blocks = full_x_blocks + (x_remainder ? 1 : 0);
54 
55  for (int y_block = 0; y_block < y_blocks; y_block++) {
56  int ybase = y0 + (y_block * height);
57 
58  int fill_rows = (y_block < full_y_blocks) ? height : y_remainder;
59 
60  for (int x_block = 0; x_block < x_blocks; x_block++) {
61  int xbase = x0 + (x_block * width);
62 
63  int fill_cols = (x_block < full_x_blocks) ? width : x_remainder;
64 
65  for (int row=0; row < fill_rows; row++) {
66  for (int col=0; col < fill_cols; col++) {
67  Tout &r = out[(ybase + row) * ldc + xbase + col];
68  Tout v = in[row * width + col];
69 
70  if (append) {
71  v += r;
72  }
73 
74  if (bias) {
75  v += bias[xbase + col];
76  }
77 
 78  switch(act.type) {
 79  default:
 80  case Activation::Type::None:
 81  break;
82 
83  case Activation::Type::ReLU:
84  v = std::max(v, static_cast<Tout>(0));
85  break;
86 
87  case Activation::Type::BoundedReLU:
88  v = std::max(std::min(v, static_cast<Tout>(act.param1)), static_cast<Tout>(0));
89  break;
90  }
91 
92  r = v;
93  }
94  }
95 
96  in += (width * height);
97  }
98  }
99 }

◆ quant_hybrid_asymmetric()

bool arm_gemm::quant_hybrid_asymmetric ( const Requantize32 qp)
inline

Definition at line 132 of file utils.hpp.

References Requantize32::per_channel_requant, and quant_no_left_shift().

132  {
133  return quant_no_left_shift(qp) /* && qp.b_offset != 0 */ && qp.per_channel_requant==false;
134 }

◆ quant_hybrid_symmetric()

bool arm_gemm::quant_hybrid_symmetric ( const Requantize32 qp)
inline

Definition at line 126 of file utils.hpp.

References Requantize32::b_offset, and quant_no_left_shift().

126  {
127  return quant_no_left_shift(qp) && qp.b_offset == 0;
128 }

◆ quant_no_left_shift()

bool arm_gemm::quant_no_left_shift ( const Requantize32 qp)
inline

Definition at line 116 of file utils.hpp.

References Requantize32::per_channel_left_shifts, Requantize32::per_channel_requant, and Requantize32::per_layer_left_shift.

Referenced by quant_hybrid_asymmetric(), and quant_hybrid_symmetric().

116  {
117  if (qp.per_channel_requant) {
118  return (qp.per_channel_left_shifts == nullptr);
119  } else {
120  return (qp.per_layer_left_shift == 0);
121  }
122 }

◆ requantize_block_32()

void arm_gemm::requantize_block_32 ( const Requantize32 qp,
unsigned int  width,
unsigned int  height,
const Tin *  input,
unsigned int  in_stride,
Tout *  output,
unsigned int  out_stride,
const int32_t *  row_bias,
const int32_t *  col_bias,
unsigned int  start_col 
)

◆ roundup()

T arm_gemm::roundup ( const T  a,
const T  b 
)
inline

Definition at line 70 of file utils.hpp.


Referenced by CpuWinogradConv2d::configure(), CpuWinogradConv2dTransformOutputKernel< T, OutputTileRows, OutputTileCols, KernelRows, KernelCols >::configure(), GemmHybrid< strategy, To, Tr >::estimate_cycles(), GemmInterleavedPretransposed2d< strategy, To, Tr >::estimate_cycles(), GemmHybridIndirect< strategy, To, Tr, OutputStage, SeparateQuantize >::estimate_cycles(), GemmInterleaved< strategy, To, Tr, OutputStage, MergeStep, ForceThreadColumns >::estimate_cycles(), PoolingDepthfirstGenericQuantized< strategy >::execute(), GemvPretransposed< strategy, To, Tr, OutputStage >::execute(), GemmHybrid< strategy, To, Tr >::execute(), GemmHybridQuantizedInline< strategy, To, Tr >::execute(), GemmHybridQuantized< strategy, To, Tr >::execute(), GemmHybridIndirect< strategy, To, Tr, OutputStage, SeparateQuantize >::execute(), GemmInterleaved< strategy, To, Tr, OutputStage, MergeStep, ForceThreadColumns >::execute(), GemvPretransposed< strategy, To, Tr, OutputStage >::GemvPretransposed(), generic_get_packed_size(), GemmHybridQuantizedInline< strategy, To, Tr >::get_B_pretransposed_array_size(), GemmHybrid< strategy, To, Tr >::get_B_pretransposed_array_size(), GemmHybridQuantized< strategy, To, Tr >::get_B_pretransposed_array_size(), GemmHybridIndirect< strategy, To, Tr, OutputStage, SeparateQuantize >::get_B_pretransposed_array_size(), GemmInterleaved< strategy, To, Tr, OutputStage, MergeStep, ForceThreadColumns >::get_B_pretransposed_array_size(), DepthwiseDepthfirstGenericWithMultiplierBase< strategy >::get_storage_size(), GemmHybrid< strategy, To, Tr >::pretranspose_B_array(), GemmHybridQuantizedInline< strategy, To, Tr >::pretranspose_B_array(), GemmHybridQuantized< strategy, To, Tr >::pretranspose_B_array(), GemmHybridIndirect< strategy, To, Tr, OutputStage, SeparateQuantize >::pretranspose_B_array(), GemmInterleaved< strategy, To, Tr, OutputStage, MergeStep, ForceThreadColumns >::pretranspose_B_array(), and CpuWinogradConv2dTransformWeightsKernel< T, OutputTileRows, OutputTileCols, KernelRows, KernelCols >::run_op().

70  {
71  T rem = a % b;
72 
73  if (rem) {
74  return a + b - rem;
75  } else {
76  return a;
77  }
78 }

◆ row_sums_indirect()

void arm_gemm::row_sums_indirect ( unsigned int  num_strings,
const unsigned int *  string_lengths,
IndirectInputArg< T >  A_arg,
size_t  M,
int32_t *  output_ptr,
const Requantize32 qp 
)

◆ to_ndcoord()

ndcoord_t arm_gemm::to_ndcoord ( const arm_compute::Window win)
inline

Convert an arm_compute::Window to an arm_gemm::NDCoord of the same max dimensions.

Parameters
[in] win  the arm_compute::Window we want to convert to arm_gemm::ndcoord_t
Returns
the resultant ndcoord_t

Definition at line 117 of file arm_gemm_compute_iface.hpp.


Referenced by CpuGemmAssemblyWrapperKernel< TypeInput, TypeOutput >::run(), and CpuGemmAssemblyWrapperKernel< TypeInput, TypeOutput >::run_nd().

118 {
119  return
120  {
121  { static_cast<unsigned int>(win[0].start()), static_cast<unsigned int>(win[0].end() - win[0].start()) },
122  { static_cast<unsigned int>(win[1].start()), static_cast<unsigned int>(win[1].end() - win[1].start()) },
123  { static_cast<unsigned int>(win[2].start()), static_cast<unsigned int>(win[2].end() - win[2].start()) },
124  { static_cast<unsigned int>(win[3].start()), static_cast<unsigned int>(win[3].end() - win[3].start()) },
125  { static_cast<unsigned int>(win[4].start()), static_cast<unsigned int>(win[4].end() - win[4].start()) },
126  { static_cast<unsigned int>(win[5].start()), static_cast<unsigned int>(win[5].end() - win[5].start()) }
127  };
128 }

◆ to_ndrange()

ndrange_t arm_gemm::to_ndrange ( const arm_compute::Window win)
inline

Convert an arm_compute::Window to an arm_gemm::NDRange of the same max dimensions.

It should be noted that arm_compute::Window specifies a start() and an end() where as arm_gemm::ndrange_t only has a size, as a result we store the delta between the range

Parameters
[in] win  the arm_compute::Window we want to convert to arm_gemm::ndrange_t
Returns
the resultant ndrange_t

Definition at line 99 of file arm_gemm_compute_iface.hpp.


100 {
101  return
102  {
103  static_cast<unsigned int>(win[0].end() - win[0].start()),
104  static_cast<unsigned int>(win[1].end() - win[1].start()),
105  static_cast<unsigned int>(win[2].end() - win[2].start()),
106  static_cast<unsigned int>(win[3].end() - win[3].start()),
107  static_cast<unsigned int>(win[4].end() - win[4].start()),
108  static_cast<unsigned int>(win[5].end() - win[5].start())
109  };
110 }

◆ to_window() [1/2]

arm_compute::Window arm_gemm::to_window ( const ndrange_t ndr)
inline

Definition at line 55 of file arm_gemm_compute_iface.hpp.

References NDRange< D >::get_size(), ndrange_max, and Window::set().

Referenced by CpuGemmAssemblyWrapperKernel< TypeInput, TypeOutput >::configure().

56 {
57  arm_compute::Window win;
58 
59  for(unsigned int i = 0; i != ndrange_max; ++i)
60  {
61  //populate the window with the dimensions of the NDRange
62  win.set(i, arm_compute::Window::Dimension(0, ndr.get_size(i)));
63  }
64 
65  return win;
66 }

◆ to_window() [2/2]

arm_compute::Window arm_gemm::to_window ( const ndcoord_t ndc)
inline

Definition at line 74 of file arm_gemm_compute_iface.hpp.

References NDCoordinate< N >::get_position(), NDRange< D >::get_size(), ndrange_max, and Window::set().

75 {
76  arm_compute::Window win;
77 
78  for(unsigned int i = 0; i != ndrange_max; ++i)
79  {
80  const auto start = ndc.get_position(i);
81  const auto size = ndc.get_size(i);
82  const auto stop = start + size;
83 
84  //populate the window with the dimensions of the NDRange
85  win.set(i, arm_compute::Window::Dimension(start, stop));
86  }
87 
88  return win;
89 }

◆ Transform()

void Transform ( TOut *  out,
const TIn *const  in,
const int  stride,
const int  k0,
const int  kmax,
const int  x0,
const int  xmax 
)

Definition at line 116 of file transform.cpp.

119  {
120  // Redirect to a specialised implementation predicated on argument size.
121  TransformImpl<IntBy, BlockBy, Transposed, sizeof(TOut), sizeof(TIn), vlt>::Transform(
122  out, in, stride, k0, kmax, x0, xmax
123  );
124 }

Variable Documentation

◆ ndrange_max

constexpr std::size_t ndrange_max
Initial value: 6 (the maximum number of window dimensions, matching the NDRange< 6 > and NDCoordinate< 6 > typedefs above)

Definition at line 41 of file arm_gemm_compute_iface.hpp.

Referenced by to_window().

◆ report_mutex

std::mutex report_mutex

Definition at line 32 of file misc.cpp.