Compute Library 21.11
arm_gemm Namespace Reference

Namespaces

 utils
 

Data Structures

struct  Activation
 
class  barrier
 
struct  ConvolutionParameters
 
class  convolver
 
struct  GemmArgs
 
class  GemmCommon
 
struct  GemmConfig
 
class  GemmHybrid
 
class  GemmHybridIndirect
 
class  GemmHybridQuantized
 
class  GemmHybridQuantizedInline
 
struct  GemmImplementation
 
struct  GemmImplementation< Top, Tret, Nothing >
 
class  GemmInterleaved
 
class  GemmInterleavedPretransposed2d
 
class  GemvBatched
 
class  GemvPretransposed
 
class  IGemmCommon
 
struct  IndirectInputArg
 
struct  IndirectOutputArg
 
struct  KernelDescription
 
class  NDCoordinate
 NDCoordinate builds upon a range, but specifies a starting position in addition to a size which it inherits from NDRange. More...
 
class  NDRange
 
struct  Nothing
 
struct  PerformanceParameters
 
class  QuantizeWrapper
 
struct  Requantize32
 
class  StdTransformsFixed
 
class  StdTransformsSVE
 

Typedefs

using bfloat16 = arm_compute::bfloat16
 
template<typename strategy , typename To , typename Tr , typename OutputStage = Nothing>
using GemmInterleavedNoMerge = GemmInterleaved< strategy, To, Tr, OutputStage, false >
 
template<typename strategy , typename To , typename Tr >
using GemmInterleavedPretransposedNoMergeQuantizedInline = GemmInterleaved< strategy, To, Tr, Requantize32, false >
 
template<typename strategy , typename To , typename Tr >
using GemmInterleavedQuantized = GemmInterleaved< strategy, To, Tr, Requantize32 >
 
template<typename Top , typename Tret >
using UniqueGemmCommon = std::unique_ptr< GemmCommon< Top, Tret > >
 
using ndrange_t = NDRange< ndrange_max >
 
using ndcoord_t = NDCoordinate< ndrange_max >
 

Enumerations

enum  VLType { None, SVE }
 
enum  GemmMethod {
  DEFAULT, GEMV_BATCHED, GEMV_PRETRANSPOSED, GEMV_NATIVE_TRANSPOSED,
  GEMM_NATIVE, GEMM_HYBRID, GEMM_INTERLEAVED, GEMM_INTERLEAVED_2D,
  QUANTIZE_WRAPPER, QUANTIZE_WRAPPER_2D, GEMM_HYBRID_QUANTIZED
}
 

Functions

template<typename T >
void bias_adder (T *out, unsigned int stride, const T *bias, unsigned int rows, unsigned int cols)
 
template<bool DoBias, typename T >
void activator (T *out, unsigned int stride, const T *bias, Activation act, unsigned int rows, unsigned int cols)
 
template<>
const GemmImplementation< bfloat16, float > * gemm_implementation_list< bfloat16, float > ()
 
template UniqueGemmCommon< bfloat16, float > gemm< bfloat16, float, Nothing > (const GemmArgs &args, const Nothing &)
 
template KernelDescription get_gemm_method< bfloat16, float, Nothing > (const GemmArgs &args, const Nothing &)
 
template std::vector< KernelDescription > get_compatible_kernels< bfloat16, float, Nothing > (const GemmArgs &args, const Nothing &)
 
template<>
const GemmImplementation< float, float > * gemm_implementation_list< float, float > ()
 
template UniqueGemmCommon< float, float > gemm< float, float, Nothing > (const GemmArgs &args, const Nothing &)
 
template KernelDescription get_gemm_method< float, float, Nothing > (const GemmArgs &args, const Nothing &)
 
template std::vector< KernelDescription > get_compatible_kernels< float, float, Nothing > (const GemmArgs &args, const Nothing &)
 
template<typename Top , typename Tret , class OutputStage = Nothing>
const GemmImplementation< Top, Tret, OutputStage > * gemm_implementation_list ()
 
template<typename Top , typename Tret , class OutputStage >
bool find_implementation (const GemmArgs &args, const OutputStage &os, const GemmImplementation< Top, Tret, OutputStage > *&impl)
 
template<typename Top , typename Tret , class OutputStage >
std::vector< KernelDescription > get_compatible_kernels (const GemmArgs &args, const OutputStage &os)
 
template<typename Top , typename Tret , class OutputStage >
UniqueGemmCommon< Top, Tret > gemm (const GemmArgs &args, const OutputStage &os)
 
template<typename Top , typename Tret , class OutputStage >
KernelDescription get_gemm_method (const GemmArgs &args, const OutputStage &os)
 
template<unsigned int height_vectors, unsigned int block, VLType vlt, bool integrate_sums, typename TIn , typename TOut >
void interleave_block (TOut *&out, const TIn *const *in, size_t width, size_t height, size_t row_offset, bool first)
 
template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TOut >
void FixupRowSums (TOut *&out, const int32_t row_sum_multiplier)
 
template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TIn , typename TOut >
void IndirectInterleave (TOut *out, const TIn *const *const *ptr, unsigned int stringlen, unsigned int rounded_stringlen, const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool integrate_sums, const int32_t row_sum_multiplier)
 
template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TIn , typename TOut >
void ConvolutionInterleave (TOut *out, const TIn *in, size_t in_stride, const convolver< TIn > &conv, const unsigned int rounded_stringlen, const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool integrate_sums, const int32_t row_sum_multiplier)
 
template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TIn , typename TOut >
void Interleave (TOut *out, const TIn *in, size_t in_stride, const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool integrate_sums, const int32_t row_sum_multiplier)
 
template<unsigned int twidth, unsigned int height, bool sve = false, typename Tin , typename Tout >
void MergeResults (Tout *out, const Tin *in, int ldc, int y0, int ymax, int x0, int xmax, const Tout *bias, Activation act, bool append)
 
template<typename Tin , typename Tout >
void requantize_block_32 (const Requantize32 &qp, unsigned int width, unsigned int height, const Tin *input, unsigned int in_stride, Tout *output, unsigned int out_stride, const int32_t *row_bias, const int32_t *col_bias, unsigned int start_col)
 
template<typename T >
void compute_row_sums (const Requantize32 &qp, unsigned int width, unsigned int height, const T *input, unsigned int in_stride, int32_t *row_bias)
 
template<typename T >
void compute_col_sums (const Requantize32 &qp, unsigned int width, unsigned int height, const T *input, unsigned int in_stride, int32_t *col_bias, unsigned int depth, unsigned int multi, unsigned int first_col)
 
template<typename T >
void row_sums_indirect (unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg< T > A_arg, size_t M, int32_t *output_ptr, const Requantize32 *qp)
 
template<unsigned int IntBy, unsigned int BlockBy, bool Transposed, VLType vlt = VLType::None, typename TOut , typename TIn >
void Transform (TOut *out, const TIn *const in, const int stride, const int k0, const int kmax, const int x0, const int xmax)
 
template<typename T >
std::string get_type_name ()
 
template<typename T >
T iceildiv (const T a, const T b)
 
template<typename T >
T roundup (const T a, const T b)
 
bool quant_no_left_shift (const Requantize32 &qp)
 
bool quant_hybrid_symmetric (const Requantize32 &qp)
 
bool quant_hybrid_asymmetric (const Requantize32 &qp)
 
arm_compute::Window to_window (const ndrange_t &ndr)
 
arm_compute::Window to_window (const ndcoord_t &ndc)
 
ndrange_t to_ndrange (const arm_compute::Window &win)
 Convert an arm_compute::Window to an arm_gemm::NDRange of the same max dimensions. More...
 
ndcoord_t to_ndcoord (const arm_compute::Window &win)
 Convert an arm_compute::Window to an arm_gemm::NDCoord of the same max dimensions. More...
 

Variables

std::mutex report_mutex
 
constexpr std::size_t ndrange_max
 

Typedef Documentation

◆ bfloat16

using bfloat16 = arm_compute::bfloat16

Definition at line 30 of file bfloat.hpp.

◆ GemmInterleavedNoMerge

using GemmInterleavedNoMerge = GemmInterleaved<strategy, To, Tr, OutputStage, false>

Definition at line 1075 of file gemm_interleaved.hpp.

◆ GemmInterleavedPretransposedNoMergeQuantizedInline

using GemmInterleavedPretransposedNoMergeQuantizedInline = GemmInterleaved<strategy, To, Tr, Requantize32, false>

Definition at line 1078 of file gemm_interleaved.hpp.

◆ GemmInterleavedQuantized

using GemmInterleavedQuantized = GemmInterleaved<strategy, To, Tr, Requantize32>

Definition at line 1081 of file gemm_interleaved.hpp.

◆ ndcoord_t

typedef NDCoordinate< 6 > ndcoord_t

Definition at line 45 of file arm_gemm_compute_iface.hpp.

◆ ndrange_t

typedef NDRange< 6 > ndrange_t

Definition at line 44 of file arm_gemm_compute_iface.hpp.

◆ UniqueGemmCommon

using UniqueGemmCommon = std::unique_ptr<GemmCommon<Top, Tret> >

Definition at line 174 of file arm_gemm.hpp.

Enumeration Type Documentation

◆ GemmMethod

enum GemmMethod
strong
Enumerator
DEFAULT
GEMV_BATCHED
GEMV_PRETRANSPOSED
GEMV_NATIVE_TRANSPOSED
GEMM_NATIVE
GEMM_HYBRID
GEMM_INTERLEAVED
GEMM_INTERLEAVED_2D
QUANTIZE_WRAPPER
QUANTIZE_WRAPPER_2D
GEMM_HYBRID_QUANTIZED

◆ VLType

enum VLType
strong
Enumerator
None 
SVE 

Definition at line 80 of file utils.hpp.

Function Documentation

◆ activator()

void arm_gemm::activator ( T *  out,
unsigned int  stride,
const T *  bias,
Activation  act,
unsigned int  rows,
unsigned int  cols 
)
inline

Definition at line 40 of file bias_adder.hpp.

References bias_adder(), Activation::BoundedReLU, Activation::None, Activation::param1, Activation::ReLU, and Activation::type.

{
    if (act.type == Activation::Type::None) {
        if (DoBias) {
            bias_adder(out, stride, bias, rows, cols);
        }
        return;
    }

    if (act.type == Activation::Type::ReLU) {
        for (unsigned int row=0; row<rows; row++) {
            for (unsigned int col=0; col<cols; col++) {
                T &v = out[row * stride + col];
                if (DoBias) {
                    v += bias[col];
                }
                v = std::max(static_cast<T>(0), v);
            }
        }
    }

    if (act.type == Activation::Type::BoundedReLU) {
        const T max = static_cast<T>(act.param1);

        for (unsigned int row=0; row<rows; row++) {
            for (unsigned int col=0; col<cols; col++) {
                T &v = out[row * stride + col];
                if (DoBias) {
                    v += bias[col];
                }
                v = std::max(static_cast<T>(0), std::min(v, max));
            }
        }
    }
}
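As a usage sketch (the buffer shapes and values here are illustrative assumptions, not taken from the library):

    #include <vector>

    // Hypothetical example: add a per-column bias, then apply ReLU, over a
    // 2x4 row-major block. DoBias=true folds the bias add into the pass.
    std::vector<float> out  = {1.0f, -2.0f, 3.0f, -4.0f,
                               5.0f, -6.0f, 7.0f, -8.0f};
    std::vector<float> bias = {0.5f, 0.5f, 0.5f, 0.5f};

    arm_gemm::Activation act;
    act.type = arm_gemm::Activation::Type::ReLU;

    arm_gemm::activator<true>(out.data(), 4 /* stride */, bias.data(), act,
                              2 /* rows */, 4 /* cols */);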

◆ bias_adder()

void arm_gemm::bias_adder ( T *  out,
unsigned int  stride,
const T *  bias,
unsigned int  rows,
unsigned int  cols 
)
inline

Definition at line 31 of file bias_adder.hpp.

Referenced by activator(), and GemmHybrid< strategy, To, Tr >::execute().

{
    for (unsigned int row=0; row<rows; row++) {
        for (unsigned int col=0; col<cols; col++) {
            out[row * stride + col] += bias[col];
        }
    }
}

◆ compute_col_sums()

void arm_gemm::compute_col_sums ( const Requantize32 qp,
unsigned int  width,
unsigned int  height,
const T *  input,
unsigned int  in_stride,
int32_t *  col_bias,
unsigned int  depth,
unsigned int  multi,
unsigned int  first_col 
)

◆ compute_row_sums()

void arm_gemm::compute_row_sums ( const Requantize32 qp,
unsigned int  width,
unsigned int  height,
const T *  input,
unsigned int  in_stride,
int32_t *  row_bias 
)

◆ ConvolutionInterleave()

void ConvolutionInterleave ( TOut *  out,
const TIn *  in,
size_t  in_stride,
const convolver< TIn > &  conv,
const unsigned int  rounded_stringlen,
const unsigned int  y0,
const unsigned int  ymax,
const unsigned int  k0,
const unsigned int  kmax,
bool  integrate_sums,
const int32_t  row_sum_multiplier 
)

Definition at line 224 of file interleave_indirect.cpp.

References convolver< T >::process_columns(), and SVE.

{
    const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);

    auto conv_cols = conv.process_columns(in, in_stride, k0, kmax, rounded_stringlen);

    // Use alloca here as a std::vector can be expensive in highly threaded scenarios.
    const TIn **row_ptrs = reinterpret_cast<const TIn **>(alloca(height * sizeof(const TIn *)));

    for (unsigned int ybase = y0; ybase < ymax; ybase += height) {
        // How many of the rows are active - the rest will get padded in interleave_block.
        unsigned int active_height = std::min(ymax - ybase, height);
        bool first = true;

        auto conv_rows = conv_cols.process_rows(ybase, active_height);

        while (!conv_rows.finished()) {
            unsigned int width, offset;

            // Get next set of parameters
            std::tie(width, offset) = conv_rows.next_block(row_ptrs);

            // Perform the interleave
            if (std::is_integral<TOut>::value && integrate_sums && row_sum_multiplier) {
                interleave_block<height_vectors, block, vlt, true>(out, row_ptrs, width, active_height, offset, first);
            } else {
                interleave_block<height_vectors, block, vlt, false>(out, row_ptrs, width, active_height, offset, first);
            }

            first=false;
        }

        if (std::is_integral<TOut>::value && integrate_sums) {
            FixupRowSums<height_vectors, block, vlt>(out, row_sum_multiplier);
        }
    }
}

◆ find_implementation()

bool arm_gemm::find_implementation ( const GemmArgs args,
const OutputStage &  os,
const GemmImplementation< Top, Tret, OutputStage > *&  impl 
)

Definition at line 166 of file gemm_implementation.hpp.

References GemmArgs::_cfg, DEFAULT, GemmImplementation< Top, Tret, OutputStage >::do_cycle_estimate(), and GemmImplementation< Top, Tret, OutputStage >::method.

Referenced by get_compatible_kernels().

{
    auto gemms = gemm_implementation_list<Top, Tret, OutputStage>();
    const GemmConfig *cfg = args._cfg;

    const GemmImplementation<Top, Tret, OutputStage> *saved_impl = nullptr;
    uint64_t best_estimate = 0;

    for (const GemmImplementation<Top, Tret, OutputStage> *i = gemms; i->method != GemmMethod::DEFAULT; i++) {
        /* Skip if this implementation doesn't support these args. */
        if (!i->do_is_supported(args, os)) {
            continue;
        }

        /* Skip if a specific method is requested and this is a different one. */
        if (cfg && cfg->method != GemmMethod::DEFAULT && i->method != cfg->method) {
            continue;
        }

        /* Skip if a filter is to be applied and it doesn't match. */
        if (cfg && cfg->filter != "" && !strstr(i->name, cfg->filter.c_str())) {
            continue;
        }

        /* Test the cycle estimate */
        uint64_t estimate = i->do_cycle_estimate(args, os);

        /* Short circuit - if the estimate is zero, return this one immediately. */
        if (estimate==0) {
            impl=i;
            return true;
        }

        /* Otherwise, remember this is our best so far if we don't yet have
         * a valid candidate, or we beat the estimate. */
        if ((saved_impl == nullptr) || (estimate < best_estimate)) {
            saved_impl = i;
            best_estimate = estimate;
        }
    }

    /* Return whichever method gave the best estimate. */
    if (saved_impl != nullptr) {
        impl = saved_impl;
        return true;
    }

    return false;
}
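As a sketch of how these checks can be driven from user code (the GemmConfig fields method and filter are the ones tested above; their defaults being usable this way is an assumption):

    // Hypothetical sketch: restrict selection to interleaved kernels whose
    // name contains "a64". A GemmArgs whose _cfg points at this config will
    // then only match implementations passing both checks above.
    arm_gemm::GemmConfig cfg;
    cfg.method = arm_gemm::GemmMethod::GEMM_INTERLEAVED;
    cfg.filter = "a64";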

◆ FixupRowSums()

void arm_gemm::FixupRowSums ( TOut *&  out,
const int32_t  row_sum_multiplier 
)
inline

Definition at line 123 of file interleave_indirect.cpp.

References SVE.

{
    const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);

    // If we are integrating row sums, we need to do some fix up, depending on whether the multiplier is non-zero or not.
    if (row_sum_multiplier) {
        // Non-zero: interleave_block<>() will have done the sums, so 'out' will point to the start of the
        // next block (post sums).
        // We need to go back and apply the multiplier to the computed sums. We don't need to change 'out'.
        int32_t *out_int32 = reinterpret_cast<int32_t *>(out);

        out_int32 -= height;
        for (unsigned int i=0; i<height; i++) {
            out_int32[i] *= row_sum_multiplier;
        }
    } else {
        // Zero: interleave_block<>() will *not* have done the sums, so 'out' will point to the start of the
        // sum block. We need to insert the (zero) sums, and advance 'out'.
        int32_t *out_int32 = reinterpret_cast<int32_t *>(out);

        for (unsigned int i=0; i<height; i++) {
            out_int32[i] = 0;
        }

        out_int32 += height;

        out = reinterpret_cast<TOut *>(out_int32);
    }
}

◆ gemm()

UniqueGemmCommon< Top, Tret > gemm ( const GemmArgs args,
const OutputStage &  os 
)

Definition at line 239 of file gemm_implementation.hpp.

References GemmImplementation< Top, Tret, OutputStage >::do_instantiate().

Referenced by arm_compute::test::validation::DATA_TEST_CASE(), and arm_compute::test::validation::TEST_CASE().

{
    const GemmImplementation<Top, Tret, OutputStage> *impl;

    if (find_implementation<Top, Tret, OutputStage>(args, os, impl)) {
        return UniqueGemmCommon<Top, Tret>(impl->do_instantiate(args, os));
    }

    return UniqueGemmCommon<Top, Tret>(nullptr);
}
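A minimal usage sketch, assuming an already-populated GemmArgs named args (its construction is not covered on this page):

    // Hypothetical sketch: select the best FP32 kernel for the problem
    // described by 'args', with no output stage.
    arm_gemm::UniqueGemmCommon<float, float> gemm_fn =
        arm_gemm::gemm<float, float, arm_gemm::Nothing>(args, arm_gemm::Nothing());

    if (!gemm_fn) {
        // No registered implementation supported these arguments.
    }

The returned unique_ptr owns the selected GemmCommon object, so its lifetime is tied to the caller.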

◆ gemm< bfloat16, float, Nothing >()

template UniqueGemmCommon<bfloat16, float> arm_gemm::gemm< bfloat16, float, Nothing > ( const GemmArgs args,
const Nothing  
)

◆ gemm< float, float, Nothing >()

template UniqueGemmCommon<float, float> arm_gemm::gemm< float, float, Nothing > ( const GemmArgs args,
const Nothing  
)

◆ gemm_implementation_list()

const GemmImplementation<Top, Tret, OutputStage>* arm_gemm::gemm_implementation_list ( )

◆ gemm_implementation_list< bfloat16, float >()

const GemmImplementation<bfloat16, float>* arm_gemm::gemm_implementation_list< bfloat16, float > ( )

◆ gemm_implementation_list< float, float >()

const GemmImplementation<float, float>* arm_gemm::gemm_implementation_list< float, float > ( )

◆ get_compatible_kernels()

std::vector< KernelDescription > get_compatible_kernels ( const GemmArgs args,
const OutputStage &  os 
)

Definition at line 216 of file gemm_implementation.hpp.

References DEFAULT, GemmImplementation< Top, Tret, OutputStage >::do_cycle_estimate(), find_implementation(), and GemmImplementation< Top, Tret, OutputStage >::method.

{
    std::vector<KernelDescription> res;

    /* Find out what the default implementation is so we can set the flag accordingly later. */
    const GemmImplementation<Top, Tret, OutputStage> *default_impl;
    find_implementation(args, os, default_impl);

    auto gemms = gemm_implementation_list<Top, Tret, OutputStage>();

    for (const GemmImplementation<Top, Tret, OutputStage> *i = gemms; i->method != GemmMethod::DEFAULT; i++) {
        /* Check that this implementation supports the presented problem. */
        if (!i->do_is_supported(args, os)) {
            continue;
        }

        res.push_back(KernelDescription(i->method, i->name, i==default_impl, i->do_cycle_estimate(args, os)));
    }

    return res;
}
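For example, a sketch that lists every candidate kernel (the KernelDescription field names follow the constructor call above; 'args' is assumed to be a populated GemmArgs):

    #include <iostream>

    // Hypothetical sketch: enumerate compatible FP32 kernels for 'args'.
    for (const auto &k : arm_gemm::get_compatible_kernels<float, float, arm_gemm::Nothing>(args, {})) {
        std::cout << k.name << (k.is_default ? " [default]" : "")
                  << " (estimated cycles: " << k.cycle_estimate << ")\n";
    }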

◆ get_compatible_kernels< bfloat16, float, Nothing >()

template std::vector<KernelDescription> arm_gemm::get_compatible_kernels< bfloat16, float, Nothing > ( const GemmArgs args,
const Nothing  
)

◆ get_compatible_kernels< float, float, Nothing >()

template std::vector<KernelDescription> arm_gemm::get_compatible_kernels< float, float, Nothing > ( const GemmArgs args,
const Nothing  
)

◆ get_gemm_method()

KernelDescription get_gemm_method ( const GemmArgs args,
const OutputStage &  os 
)

Definition at line 250 of file gemm_implementation.hpp.

References GemmImplementation< Top, Tret, OutputStage >::method, and GemmImplementation< Top, Tret, OutputStage >::name.

{
    const GemmImplementation<Top, Tret, OutputStage> *impl;

    if (find_implementation<Top, Tret>(args, os, impl)) {
        return KernelDescription(impl->method, impl->name);
    }

    /* This shouldn't happen - there should always be at least one valid implementation. */
    return KernelDescription();
}

◆ get_gemm_method< bfloat16, float, Nothing >()

template KernelDescription arm_gemm::get_gemm_method< bfloat16, float, Nothing > ( const GemmArgs & args, const Nothing & )

◆ get_gemm_method< float, float, Nothing >()

template KernelDescription arm_gemm::get_gemm_method< float, float, Nothing > ( const GemmArgs args,
const Nothing  
)

◆ get_type_name()

std::string arm_gemm::get_type_name ( )

Definition at line 42 of file utils.hpp.

{
#ifdef __GNUC__
    std::string s = __PRETTY_FUNCTION__;

    auto start = s.find("cls_");

    if (start==std::string::npos) {
        return "(unknown)";
    }

    for(size_t x = start+4; x<s.size(); x++) {
        if (s[x] == ';' || s[x] == ']') {
            return s.substr(start+4, x-(start+4));
        }
    }

    return "(unknown)";
#else
    return "(unsupported)";
#endif
}
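
A hypothetical call, for illustration (the exact string returned depends on how the compiler renders __PRETTY_FUNCTION__):

    // Debug helper: render the template type as text.
    std::string name = arm_gemm::get_type_name<float>();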

◆ iceildiv()

T arm_gemm::iceildiv ( const T  a,
const T  b 
)
inline

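No body is reproduced on this page; the conventional ceiling-division one-liner matching this signature is (an assumption, not copied from the source):

    template<typename T>
    inline T iceildiv(const T a, const T b) {
        return (a + b - 1) / b;  // e.g. iceildiv(10, 4) == 3
    }
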
◆ IndirectInterleave()

void IndirectInterleave ( TOut *  out,
const TIn *const *const *  ptr,
unsigned int  stringlen,
unsigned int  rounded_stringlen,
const unsigned int  y0,
const unsigned int  ymax,
const unsigned int  k0,
const unsigned int  kmax,
bool  integrate_sums,
const int32_t  row_sum_multiplier 
)

Definition at line 153 of file interleave_indirect.cpp.

References SVE.

{
    const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);

    // 'interleave_block' implementations are entitled to read a pointer for each row they handle from the input
    // pointer array, even for out of range rows (although they must not subsequently dereference those pointers for
    // out of range rows). This allows interleave_block to use techniques like row predication, or loading all
    // pointers and conditionally overriding the out of range ones.

    // This is problematic in the "pure" indirect case when we get to the last rows, where it can lead to out of
    // range reads. Avoid this with a local buffer to use in last-rows cases. Use alloca as a std::vector can be
    // expensive in highly threaded scenarios.
    const TIn **row_ptrs = reinterpret_cast<const TIn **>(alloca(height * sizeof(const TIn *)));

    // Figure out the starting position based on k0 (with rounded length)
    unsigned int start_string = k0 / rounded_stringlen;
    unsigned int start_stringpos = k0 % rounded_stringlen;

    // Process blocks of 'height' height...
    for (unsigned int ybase = y0; ybase < ymax; ybase+=height) {
        // Height to process
        unsigned int active_height = std::min(ymax - ybase, height);

        // Track our progress through the various strings
        unsigned int k_left = (kmax - k0);
        unsigned int string = start_string;
        unsigned int stringpos = start_stringpos;

        bool first = true;

        // Prepare to call 'interleave_block' above for each string encompassed by K range
        while (k_left > 0) {
            // Width to process - and the width we will generate (with padding)
            unsigned int in_width = std::min(k_left, stringlen - stringpos);
            unsigned int out_width = std::min(k_left, rounded_stringlen - stringpos);

            const TIn * const *row_base = ptr[string] + ybase;

            // If not all rows are valid, copy the ones that are into local array (see above comment).
            if (active_height < height) {
                for (unsigned int i=0; i<active_height; i++) {
                    row_ptrs[i] = ptr[string][ybase + i];
                }

                row_base = row_ptrs;
            }

            // 'integrate_sums' is a function parameter rather than a template parameter to prevent duplicating too
            // much code. However, integrated sums make no sense for non-integral types and won't ever be
            // requested. So put a type trait check here to avoid generating pointless code.
            if (std::is_integral<TOut>::value && integrate_sums && row_sum_multiplier) {
                interleave_block<height_vectors, block, vlt, true>(out, row_base, in_width, active_height, stringpos, first);
            } else {
                interleave_block<height_vectors, block, vlt, false>(out, row_base, in_width, active_height, stringpos, first);
            }

            k_left -= out_width;
            string++;
            stringpos=0;
            first=false;
        }

        if (std::is_integral<TOut>::value && integrate_sums) {
            FixupRowSums<height_vectors, block, vlt>(out, row_sum_multiplier);
        }
    }
}

◆ Interleave()

void Interleave ( TOut *  out,
const TIn *  in,
size_t  in_stride,
const unsigned int  y0,
const unsigned int  ymax,
const unsigned int  k0,
const unsigned int  kmax,
bool  integrate_sums,
const int32_t  row_sum_multiplier 
)

Definition at line 263 of file interleave_indirect.cpp.

References SVE.

{
    const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);

    // Use alloca here as a std::vector can be expensive in highly threaded scenarios.
    const TIn **row_ptrs = reinterpret_cast<const TIn **>(alloca(height * sizeof(const TIn *)));

    const unsigned int width=kmax-k0;

    for (unsigned int y=y0; y<ymax; y+=height) {
        for (unsigned int r=0; r<height; r++) {
            row_ptrs[r] = in + ((y + r) * in_stride);
        }

        if (std::is_integral<TOut>::value && integrate_sums && row_sum_multiplier) {
            interleave_block<height_vectors, block, vlt, true>(out, row_ptrs, width, std::min(height, ymax-y), k0, true);
        } else {
            interleave_block<height_vectors, block, vlt, false>(out, row_ptrs, width, std::min(height, ymax-y), k0, true);
        }

        if (std::is_integral<TOut>::value && integrate_sums) {
            FixupRowSums<height_vectors, block, vlt>(out, row_sum_multiplier);
        }
    }
}

◆ interleave_block()

void arm_gemm::interleave_block ( TOut *&  out,
const TIn *const *  in,
size_t  width,
size_t  height,
size_t  row_offset,
bool  first 
)

Definition at line 59 of file interleave_indirect.cpp.

References SVE.

{
    const unsigned int int_by = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);

    std::vector<int32_t> the_sums;

    if (integrate_sums) {
        the_sums = std::vector<int32_t>(int_by, 0);

        if (!first) {
            // In 'integrate sums' mode, we dump the sums at the end on each pass.

            // On the last pass this is correct, but on other passes it is not -
            // so on the subsequent pass we need to take the output written by
            // the previous pass as starting point for the sums, and then
            // overwrite them with new interleaved data.
            int32_t *out_int32 = reinterpret_cast<int32_t *>(out);

            // Rewind pointer to where we wrote out the sums last time.
            out_int32 -= int_by;

            // Restore the running sums.
            memcpy(the_sums.data(), out_int32, int_by * sizeof(int32_t));

            // Update the "real" pointer so that the next output will clobber the old sums.
            out = reinterpret_cast<TOut *>(out_int32);
        }
    }

    for (unsigned int pos=0; pos<width; pos+=block) {
        for (unsigned int row=0; row<int_by; row++) {
            // Row out of range - pad 'block' entries.
            if (row >= height) {
                for (unsigned int col=0; col<block; col++) {
                    *out++ = 0;
                }
                continue;
            }

            for (unsigned int col=0; col<block; col++) {
                // Column out of range - pad a single entry
                if (pos + col >= width) {
                    *out++ = 0;
                    continue;
                }

                if (integrate_sums) {
                    the_sums[row] += in[row][row_offset + pos + col];
                }

                *out++ = in[row][row_offset + pos + col];
            }
        }
    }

    if (integrate_sums) {
        int32_t *out_int32 = reinterpret_cast<int32_t *>(out);

        memcpy(out_int32, the_sums.data(), int_by * sizeof(int32_t));

        out = reinterpret_cast<TOut *>(out_int32 + int_by);
    }
}

◆ MergeResults()

void MergeResults ( Tout *  out,
const Tin *  in,
int  ldc,
int  y0,
int  ymax,
int  x0,
int  xmax,
const Tout *  bias,
Activation  act,
bool  append 
)

Definition at line 38 of file mergeresults.cpp.

References Activation::BoundedReLU, Activation::None, Activation::param1, Activation::ReLU, and Activation::type.

Referenced by StdTransformsSVE< TOperand, TResult, height, width_vectors, block, mmla, integrate_sums >::Merge().

{
    // NOTE: The following code is disabled to avoid calling get_vector_length(), so templated MergeResults will not
    // be correct for SVE cases. This is OK as we have specialisations for all needed SVE cases anyway.
    //
    // For SVE cases, multiply the width up by the vector length.
    // Use the *input* type to determine this, since this will be what the kernel operated on.
    // const int width = twidth * (sve ? get_vector_length<Tin>() : 1);
    const int width = twidth;

    const int full_y_blocks = (ymax - y0) / height;
    const int y_remainder = (ymax - y0) % height;
    const int y_blocks = full_y_blocks + (y_remainder ? 1 : 0);

    const int full_x_blocks = (xmax - x0) / width;
    const int x_remainder = (xmax - x0) % width;
    const int x_blocks = full_x_blocks + (x_remainder ? 1 : 0);

    for (int y_block = 0; y_block < y_blocks; y_block++) {
        int ybase = y0 + (y_block * height);

        int fill_rows = (y_block < full_y_blocks) ? height : y_remainder;

        for (int x_block = 0; x_block < x_blocks; x_block++) {
            int xbase = x0 + (x_block * width);

            int fill_cols = (x_block < full_x_blocks) ? width : x_remainder;

            for (int row=0; row < fill_rows; row++) {
                for (int col=0; col < fill_cols; col++) {
                    Tout &r = out[(ybase + row) * ldc + xbase + col];
                    Tout v = in[row * width + col];

                    if (append) {
                        v += r;
                    }

                    if (bias) {
                        v += bias[xbase + col];
                    }

                    switch(act.type) {
                        default:
                        case Activation::Type::None:
                            break;

                        case Activation::Type::ReLU:
                            v = std::max(v, static_cast<Tout>(0));
                            break;

                        case Activation::Type::BoundedReLU:
                            v = std::max(std::min(v, static_cast<Tout>(act.param1)), static_cast<Tout>(0));
                            break;
                    }

                    r = v;
                }
            }

            in += (width * height);
        }
    }
}

◆ quant_hybrid_asymmetric()

bool arm_gemm::quant_hybrid_asymmetric ( const Requantize32 qp)
inline

Definition at line 132 of file utils.hpp.

References Requantize32::per_channel_requant, and quant_no_left_shift().

{
    return quant_no_left_shift(qp) /* && qp.b_offset != 0 */ && qp.per_channel_requant==false;
}

◆ quant_hybrid_symmetric()

bool arm_gemm::quant_hybrid_symmetric ( const Requantize32 qp)
inline

Definition at line 126 of file utils.hpp.

References Requantize32::b_offset, and quant_no_left_shift().

{
    return quant_no_left_shift(qp) && qp.b_offset == 0;
}

◆ quant_no_left_shift()

bool arm_gemm::quant_no_left_shift ( const Requantize32 qp)
inline

Definition at line 116 of file utils.hpp.

References Requantize32::per_channel_left_shifts, Requantize32::per_channel_requant, and Requantize32::per_layer_left_shift.

Referenced by quant_hybrid_asymmetric(), and quant_hybrid_symmetric().

{
    if (qp.per_channel_requant) {
        return (qp.per_channel_left_shifts == nullptr);
    } else {
        return (qp.per_layer_left_shift == 0);
    }
}
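
A sketch of the three predicates together (the field names are those referenced above; zero-initialisation of Requantize32 is an assumption):

    // Hypothetical: a per-layer requantization with no left shift and a zero
    // b_offset passes the no-left-shift test and both hybrid tests.
    arm_gemm::Requantize32 qp{};
    qp.per_channel_requant  = false;
    qp.per_layer_left_shift = 0;
    qp.b_offset             = 0;

    bool no_shift = arm_gemm::quant_no_left_shift(qp);      // true
    bool sym      = arm_gemm::quant_hybrid_symmetric(qp);   // true: b_offset == 0
    bool asym     = arm_gemm::quant_hybrid_asymmetric(qp);  // also true: the b_offset check is commented out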

◆ requantize_block_32()

void arm_gemm::requantize_block_32 ( const Requantize32 qp,
unsigned int  width,
unsigned int  height,
const Tin *  input,
unsigned int  in_stride,
Tout *  output,
unsigned int  out_stride,
const int32_t *  row_bias,
const int32_t *  col_bias,
unsigned int  start_col 
)

◆ roundup()

T arm_gemm::roundup ( const T  a,
const T  b 
)
inline

Definition at line 70 of file utils.hpp.

Referenced by CpuWinogradConv2d::configure(), CpuWinogradConv2dTransformOutputKernel< T, OutputTileRows, OutputTileCols, KernelRows, KernelCols >::configure(), GemmHybrid< strategy, To, Tr >::estimate_cycles(), GemmInterleavedPretransposed2d< strategy, To, Tr >::estimate_cycles(), GemmHybridIndirect< strategy, To, Tr, OutputStage, SeparateQuantize >::estimate_cycles(), GemmInterleaved< strategy, To, Tr, OutputStage, MergeStep, ForceThreadColumns >::estimate_cycles(), PoolingDepthfirstGeneric< strategy >::execute(), PoolingDepthfirstGenericQuantized< strategy >::execute(), PoolingDepthfirst< strategy >::execute(), GemvPretransposed< strategy, To, Tr, OutputStage >::execute(), GemmHybrid< strategy, To, Tr >::execute(), GemmHybridQuantizedInline< strategy, To, Tr >::execute(), GemmHybridQuantized< strategy, To, Tr >::execute(), DepthwiseDepthfirstWithMultiplier< strategy >::execute(), GemmHybridIndirect< strategy, To, Tr, OutputStage, SeparateQuantize >::execute(), GemmInterleaved< strategy, To, Tr, OutputStage, MergeStep, ForceThreadColumns >::execute(), GemvPretransposed< strategy, To, Tr, OutputStage >::GemvPretransposed(), generic_get_packed_size(), GemmHybridQuantizedInline< strategy, To, Tr >::get_B_pretransposed_array_size(), GemmHybrid< strategy, To, Tr >::get_B_pretransposed_array_size(), GemmHybridQuantized< strategy, To, Tr >::get_B_pretransposed_array_size(), GemmHybridIndirect< strategy, To, Tr, OutputStage, SeparateQuantize >::get_B_pretransposed_array_size(), GemmInterleaved< strategy, To, Tr, OutputStage, MergeStep, ForceThreadColumns >::get_B_pretransposed_array_size(), DepthwiseDepthfirst< TInput, TWeight, TOutput, TAccum >::get_storage_size(), DepthwiseDepthfirstGenericBase< Strategy, OutputRows, OutputCols >::get_storage_size(), DepthwiseDepthfirstWithMultiplier< strategy >::get_storage_size(), DepthwiseDepthfirstGenericWithMultiplierBase< strategy >::get_storage_size(), GemmHybrid< strategy, To, Tr >::pretranspose_B_array(), GemmHybridQuantizedInline< strategy, To, Tr >::pretranspose_B_array(), GemmHybridQuantized< strategy, To, Tr >::pretranspose_B_array(), GemmHybridIndirect< strategy, To, Tr, OutputStage, SeparateQuantize >::pretranspose_B_array(), GemmInterleaved< strategy, To, Tr, OutputStage, MergeStep, ForceThreadColumns >::pretranspose_B_array(), and CpuWinogradConv2dTransformWeightsKernel< T, OutputTileRows, OutputTileCols, KernelRows, KernelCols >::run_op().

{
    T rem = a % b;

    if (rem) {
        return a + b - rem;
    } else {
        return a;
    }
}
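For instance (hypothetical values):

    unsigned int padded = arm_gemm::roundup(19u, 8u);  // 24: next multiple of 8
    unsigned int exact  = arm_gemm::roundup(16u, 8u);  // 16: already a multiple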

◆ row_sums_indirect()

void arm_gemm::row_sums_indirect ( unsigned int  num_strings,
const unsigned int *  string_lengths,
IndirectInputArg< T >  A_arg,
size_t  M,
int32_t *  output_ptr,
const Requantize32 qp 
)

◆ to_ndcoord()

ndcoord_t arm_gemm::to_ndcoord ( const arm_compute::Window win)
inline

Convert an arm_compute::Window to an arm_gemm::NDCoord of the same max dimensions.

Parameters
    win  the arm_compute::Window we want to convert to arm_gemm::ndcoord_t

Returns
    the resultant ndcoord_t

Definition at line 117 of file arm_gemm_compute_iface.hpp.

References arm_compute::mlgo::parser::end().

Referenced by CpuGemmAssemblyWrapperKernel< TypeInput, TypeOutput >::run(), and CpuGemmAssemblyWrapperKernel< TypeInput, TypeOutput >::run_nd().

{
    return
    {
        { static_cast<unsigned int>(win[0].start()), static_cast<unsigned int>(win[0].end() - win[0].start()) },
        { static_cast<unsigned int>(win[1].start()), static_cast<unsigned int>(win[1].end() - win[1].start()) },
        { static_cast<unsigned int>(win[2].start()), static_cast<unsigned int>(win[2].end() - win[2].start()) },
        { static_cast<unsigned int>(win[3].start()), static_cast<unsigned int>(win[3].end() - win[3].start()) },
        { static_cast<unsigned int>(win[4].start()), static_cast<unsigned int>(win[4].end() - win[4].start()) },
        { static_cast<unsigned int>(win[5].start()), static_cast<unsigned int>(win[5].end() - win[5].start()) }
    };
}

◆ to_ndrange()

ndrange_t arm_gemm::to_ndrange ( const arm_compute::Window win)
inline

Convert an arm_compute::Window to an arm_gemm::NDRange of the same max dimensions.

Note that arm_compute::Window specifies a start() and an end(), whereas arm_gemm::ndrange_t only has a size; as a result we store the difference (end() - start()) for each dimension.

Parameters
    win  the arm_compute::Window we want to convert to arm_gemm::ndrange_t

Returns
    the resultant ndrange_t

Definition at line 99 of file arm_gemm_compute_iface.hpp.

References arm_compute::mlgo::parser::end().

{
    return
    {
        static_cast<unsigned int>(win[0].end() - win[0].start()),
        static_cast<unsigned int>(win[1].end() - win[1].start()),
        static_cast<unsigned int>(win[2].end() - win[2].start()),
        static_cast<unsigned int>(win[3].end() - win[3].start()),
        static_cast<unsigned int>(win[4].end() - win[4].start()),
        static_cast<unsigned int>(win[5].end() - win[5].start())
    };
}
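The difference between the two conversions, as a sketch (the window contents are illustrative assumptions):

    // Hypothetical: a 1-D window covering [4, 12).
    arm_compute::Window win;
    win.set(0, arm_compute::Window::Dimension(4, 12));

    arm_gemm::ndrange_t r = arm_gemm::to_ndrange(win);  // keeps size 8 only; the start is lost
    arm_gemm::ndcoord_t c = arm_gemm::to_ndcoord(win);  // keeps position 4 and size 8

    arm_compute::Window w1 = arm_gemm::to_window(r);    // dimension 0: [0, 8)
    arm_compute::Window w2 = arm_gemm::to_window(c);    // dimension 0: [4, 12)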

◆ to_window() [1/2]

arm_compute::Window arm_gemm::to_window ( const ndrange_t ndr)
inline

Definition at line 55 of file arm_gemm_compute_iface.hpp.

References NDRange< D >::get_size(), ndrange_max, and Window::set().

Referenced by CpuGemmAssemblyWrapperKernel< TypeInput, TypeOutput >::configure().

{
    arm_compute::Window win;

    for(unsigned int i = 0; i != ndrange_max; ++i)
    {
        //populate the window with the dimensions of the NDRange
        win.set(i, arm_compute::Window::Dimension(0, ndr.get_size(i)));
    }

    return win;
}

◆ to_window() [2/2]

arm_compute::Window arm_gemm::to_window ( const ndcoord_t ndc)
inline

Definition at line 74 of file arm_gemm_compute_iface.hpp.

References NDCoordinate< N >::get_position(), NDRange< D >::get_size(), ndrange_max, and Window::set().

{
    arm_compute::Window win;

    for(unsigned int i = 0; i != ndrange_max; ++i)
    {
        const auto start = ndc.get_position(i);
        const auto size  = ndc.get_size(i);
        const auto stop  = start + size;

        //populate the window with the dimensions of the NDRange
        win.set(i, arm_compute::Window::Dimension(start, stop));
    }

    return win;
}

◆ Transform()

void Transform ( TOut *  out,
const TIn *const  in,
const int  stride,
const int  k0,
const int  kmax,
const int  x0,
const int  xmax 
)

Definition at line 114 of file transform.cpp.

{
    // Redirect to a specialised implementation predicated on argument size.
    TransformImpl<IntBy, BlockBy, Transposed, sizeof(TOut), sizeof(TIn), vlt>::Transform(
        out, in, stride, k0, kmax, x0, xmax
    );
}

Variable Documentation

◆ ndrange_max

constexpr std::size_t ndrange_max
Initial value:
= arm_compute::Dimensions<unsigned int>::num_max_dimensions

Definition at line 41 of file arm_gemm_compute_iface.hpp.

Referenced by to_window().

◆ report_mutex

std::mutex report_mutex

Definition at line 32 of file misc.cpp.