Compute Library 21.02
arm_gemm Namespace Reference

Namespaces

 utils
 

Data Structures

class  barrier
 
class  convolver
 
class  GemmHybrid
 
class  GemmHybridIndirect
 
class  GemmHybridQuantized
 
class  GemmHybridQuantizedInline
 
struct  GemmImplementation
 
struct  GemmImplementation< Top, Tret, Nothing >
 
class  GemmInterleaved
 
class  GemmInterleavedPretransposed2d
 
class  GemvBatched
 
class  GemvPretransposed
 
struct  IndirectInputArg
 
struct  IndirectOutputArg
 
struct  PerformanceParameters
 
class  QuantizeWrapper
 
class  StdTransformsFixed
 
class  StdTransformsSVE
 
struct  TransformImpl
 

Typedefs

using bfloat16 = arm_compute::bfloat16
 
template<typename strategy , typename To , typename Tr , typename OutputStage = Nothing>
using GemmInterleavedNoMerge = GemmInterleaved< strategy, To, Tr, OutputStage, false >
 
template<typename strategy , typename To , typename Tr >
using GemmInterleavedPretransposedNoMergeQuantizedInline = GemmInterleaved< strategy, To, Tr, Requantize32, false >
 
template<typename strategy , typename To , typename Tr >
using GemmInterleavedQuantized = GemmInterleaved< strategy, To, Tr, Requantize32 >
 

Enumerations

enum  VLType { None, SVE }
 

Functions

template<typename T >
void bias_adder (T *out, unsigned int stride, const T *bias, unsigned int rows, unsigned int cols)
 
template<bool DoBias, typename T >
void activator (T *out, unsigned int stride, const T *bias, Activation act, unsigned int rows, unsigned int cols)
 
template<>
const GemmImplementation< bfloat16, float > * gemm_implementation_list< bfloat16, float > ()
 
template UniqueGemmCommon< bfloat16, float > gemm< bfloat16, float, Nothing > (const GemmArgs &args, const Nothing &)
 
template KernelDescription get_gemm_method< bfloat16, float, Nothing > (const GemmArgs &args, const Nothing &)
 
template std::vector< KernelDescription > get_compatible_kernels< bfloat16, float, Nothing > (const GemmArgs &args, const Nothing &)
 
template<>
const GemmImplementation< float, float > * gemm_implementation_list< float, float > ()
 
template UniqueGemmCommon< float, float > gemm< float, float, Nothing > (const GemmArgs &args, const Nothing &)
 
template KernelDescription get_gemm_method< float, float, Nothing > (const GemmArgs &args, const Nothing &)
 
template std::vector< KernelDescription > get_compatible_kernels< float, float, Nothing > (const GemmArgs &args, const Nothing &)
 
template<typename Top , typename Tret , class OutputStage = Nothing>
const GemmImplementation< Top, Tret, OutputStage > * gemm_implementation_list ()
 
template<typename Top , typename Tret , class OutputStage >
bool find_implementation (const GemmArgs &args, const OutputStage &os, const GemmImplementation< Top, Tret, OutputStage > *&impl)
 
template<typename Top , typename Tret , class OutputStage >
std::vector< KernelDescription > get_compatible_kernels (const GemmArgs &args, const OutputStage &os)
 
template<typename Top , typename Tret , class OutputStage >
UniqueGemmCommon< Top, Tret > gemm (const GemmArgs &args, const OutputStage &os)
 
template<typename Top , typename Tret , class OutputStage >
KernelDescription get_gemm_method (const GemmArgs &args, const OutputStage &os)
 
template<unsigned int height_vectors, unsigned int block, VLType vlt, bool integrate_sums, typename TIn , typename TOut >
void interleave_block (TOut *&out, const TIn *const *in, size_t width, size_t height, size_t row_offset, bool first)
 
template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TOut >
void FixupRowSums (TOut *&out, const int32_t row_sum_multiplier)
 
template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TIn , typename TOut >
void IndirectInterleave (TOut *out, const TIn *const *const *ptr, unsigned int stringlen, unsigned int rounded_stringlen, const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool integrate_sums, const int32_t row_sum_multiplier)
 
template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TIn , typename TOut >
void ConvolutionInterleave (TOut *out, const TIn *in, size_t in_stride, const convolver< TIn > &conv, const unsigned int rounded_stringlen, const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool integrate_sums, const int32_t row_sum_multiplier)
 
template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TIn , typename TOut >
void Interleave (TOut *out, const TIn *in, size_t in_stride, const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool integrate_sums, const int32_t row_sum_multiplier)
 
template<unsigned int twidth, unsigned int height, bool sve = false, typename Tin , typename Tout >
void MergeResults (Tout *out, const Tin *in, int ldc, int y0, int ymax, int x0, int xmax, const Tout *bias, Activation act, bool append)
 
template<typename Tin , typename Tout >
void requantize_block_32 (const Requantize32 &qp, unsigned int width, unsigned int height, const Tin *input, unsigned int in_stride, Tout *output, unsigned int out_stride, const int32_t *row_bias, const int32_t *col_bias, unsigned int start_col)
 
template<typename T >
void compute_row_sums (const Requantize32 &qp, unsigned int width, unsigned int height, const T *input, unsigned int in_stride, int32_t *row_bias)
 
template<typename T >
void compute_col_sums (const Requantize32 &qp, unsigned int width, unsigned int height, const T *input, unsigned int in_stride, int32_t *col_bias, unsigned int depth, unsigned int multi, unsigned int first_col)
 
template<typename T >
void row_sums_indirect (unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg< T > A_arg, size_t M, int32_t *output_ptr, const Requantize32 *qp)
 
template<unsigned int IntBy, unsigned int BlockBy, bool Transposed, VLType vlt = VLType::None, typename TOut , typename TIn >
void Transform (TOut *out, const TIn *const in, const int stride, const int k0, const int kmax, const int x0, const int xmax)
 
template<typename T >
T iceildiv (const T a, const T b)
 
template<typename T >
T roundup (const T a, const T b)
 
bool quant_no_left_shift (const Requantize32 &qp)
 
bool quant_hybrid_symmetric (const Requantize32 &qp)
 
bool quant_hybrid_asymmetric (const Requantize32 &qp)
 

Variables

std::mutex report_mutex
 

Typedef Documentation

◆ bfloat16

using bfloat16 = arm_compute::bfloat16

Definition at line 30 of file bfloat.hpp.

◆ GemmInterleavedNoMerge

using GemmInterleavedNoMerge = GemmInterleaved<strategy, To, Tr, OutputStage, false>

Definition at line 1049 of file gemm_interleaved.hpp.

◆ GemmInterleavedPretransposedNoMergeQuantizedInline

using GemmInterleavedPretransposedNoMergeQuantizedInline = GemmInterleaved<strategy, To, Tr, Requantize32, false>

Definition at line 1052 of file gemm_interleaved.hpp.

◆ GemmInterleavedQuantized

using GemmInterleavedQuantized = GemmInterleaved<strategy, To, Tr, Requantize32>

Definition at line 1055 of file gemm_interleaved.hpp.

Enumeration Type Documentation

◆ VLType

enum VLType (strong; declared as enum class)

Enumerator:
None
SVE

Definition at line 55 of file utils.hpp.

Function Documentation

◆ activator()

void arm_gemm::activator ( T *  out,
unsigned int  stride,
const T *  bias,
Activation  act,
unsigned int  rows,
unsigned int  cols 
)
inline

Definition at line 40 of file bias_adder.hpp.

References bias_adder().

{
    if (act.type == Activation::Type::None) {
        if (DoBias) {
            bias_adder(out, stride, bias, rows, cols);
        }
        return;
    }

    if (act.type == Activation::Type::ReLU) {
        for (unsigned int row=0; row<rows; row++) {
            for (unsigned int col=0; col<cols; col++) {
                T &v = out[row * stride + col];
                if (DoBias) {
                    v += bias[col];
                }
                v = std::max(static_cast<T>(0), v);
            }
        }
    }

    if (act.type == Activation::Type::BoundedReLU) {
        const T max = static_cast<T>(act.param1);

        for (unsigned int row=0; row<rows; row++) {
            for (unsigned int col=0; col<cols; col++) {
                T &v = out[row * stride + col];
                if (DoBias) {
                    v += bias[col];
                }
                v = std::max(static_cast<T>(0), std::min(v, max));
            }
        }
    }
}
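A minimal usage sketch calling activator() directly on a small row-major block. The Activation constructor signature and the availability of the internal arm_gemm headers are assumptions; the sizes and values are illustrative.

#include <algorithm>  // std::max / std::min, used by activator()

void activator_example() {
    float out[2 * 3] = {-1.0f, 2.0f, -3.0f, 4.0f, -5.0f, 6.0f};  // 2 rows, stride 3
    const float bias[3] = {0.5f, 0.5f, 0.5f};

    // Add the per-column bias and clamp negatives to zero in one pass.
    arm_gemm::activator<true>(out, 3, bias,
                              arm_gemm::Activation(arm_gemm::Activation::Type::ReLU),
                              2, 3);
    // out is now {0.0, 2.5, 0.0, 4.5, 0.0, 6.5}
}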

◆ bias_adder()

void arm_gemm::bias_adder ( T *  out,
unsigned int  stride,
const T *  bias,
unsigned int  rows,
unsigned int  cols 
)
inline

Definition at line 31 of file bias_adder.hpp.


Referenced by activator().

{
    for (unsigned int row=0; row<rows; row++) {
        for (unsigned int col=0; col<cols; col++) {
            out[row * stride + col] += bias[col];
        }
    }
}
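A small worked example (values illustrative): each column of a 2x3 block, stored with stride 3, gets its per-column bias added.

void bias_adder_example() {
    float out[2 * 3] = {1, 2, 3, 4, 5, 6};  // 2 rows, stride 3
    const float bias[3] = {10, 20, 30};

    arm_gemm::bias_adder(out, 3, bias, 2, 3);
    // out is now {11, 22, 33, 14, 25, 36}
}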

◆ compute_col_sums()

void arm_gemm::compute_col_sums ( const Requantize32 &  qp,
unsigned int  width,
unsigned int  height,
const T *  input,
unsigned int  in_stride,
int32_t *  col_bias,
unsigned int  depth,
unsigned int  multi,
unsigned int  first_col 
)

◆ compute_row_sums()

void arm_gemm::compute_row_sums ( const Requantize32 &  qp,
unsigned int  width,
unsigned int  height,
const T *  input,
unsigned int  in_stride,
int32_t *  row_bias 
)

◆ ConvolutionInterleave()

void ConvolutionInterleave ( TOut *  out,
const TIn *  in,
size_t  in_stride,
const convolver< TIn > &  conv,
const unsigned int  rounded_stringlen,
const unsigned int  y0,
const unsigned int  ymax,
const unsigned int  k0,
const unsigned int  kmax,
bool  integrate_sums,
const int32_t  row_sum_multiplier 
)

Definition at line 224 of file interleave_indirect.cpp.

References convolver< T >::process_columns(), and SVE.

{
    const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);

    auto conv_cols = conv.process_columns(in, in_stride, k0, kmax, rounded_stringlen);

    // Use alloca here as a std::vector can be expensive in highly threaded scenarios.
    const TIn **row_ptrs = reinterpret_cast<const TIn **>(alloca(height * sizeof(const TIn *)));

    for (unsigned int ybase = y0; ybase < ymax; ybase += height) {
        // How many of the rows are active - the rest will get padded in interleave_block.
        unsigned int active_height = std::min(ymax - ybase, height);
        bool first = true;

        auto conv_rows = conv_cols.process_rows(ybase, active_height);

        while (!conv_rows.finished()) {
            unsigned int width, offset;

            // Get next set of parameters
            std::tie(width, offset) = conv_rows.next_block(row_ptrs);

            // Perform the interleave
            if (std::is_integral<TOut>::value && integrate_sums && row_sum_multiplier) {
                interleave_block<height_vectors, block, vlt, true>(out, row_ptrs, width, active_height, offset, first);
            } else {
                interleave_block<height_vectors, block, vlt, false>(out, row_ptrs, width, active_height, offset, first);
            }

            first = false;
        }

        if (std::is_integral<TOut>::value && integrate_sums) {
            FixupRowSums<height_vectors, block, vlt>(out, row_sum_multiplier);
        }
    }
}

◆ find_implementation()

bool arm_gemm::find_implementation ( const GemmArgs &  args,
const OutputStage &  os,
const GemmImplementation< Top, Tret, OutputStage > *&  impl 
)

Definition at line 166 of file gemm_implementation.hpp.

References GemmImplementation< Top, Tret, OutputStage >::do_cycle_estimate(), and GemmImplementation< Top, Tret, OutputStage >::method.

Referenced by get_compatible_kernels().

{
    auto gemms = gemm_implementation_list<Top, Tret, OutputStage>();
    const GemmConfig *cfg = args._cfg;

    const GemmImplementation<Top, Tret, OutputStage> *saved_impl = nullptr;
    uint64_t best_estimate = 0;

    for (const GemmImplementation<Top, Tret, OutputStage> *i = gemms; i->method != GemmMethod::DEFAULT; i++) {
        /* Skip if this implementation doesn't support these args. */
        if (!i->do_is_supported(args, os)) {
            continue;
        }

        /* Skip if a specific method is requested and this is a different one. */
        if (cfg && cfg->method != GemmMethod::DEFAULT && i->method != cfg->method) {
            continue;
        }

        /* Skip if a filter is to be applied and it doesn't match. */
        if (cfg && cfg->filter != "" && !strstr(i->name, cfg->filter.c_str())) {
            continue;
        }

        /* Test the cycle estimate. */
        uint64_t estimate = i->do_cycle_estimate(args, os);

        /* Short circuit - if the estimate is zero, return this one immediately. */
        if (estimate == 0) {
            impl = i;
            return true;
        }

        /* Otherwise, remember this as our best so far if we don't yet have
         * a valid candidate, or we beat the estimate. */
        if ((saved_impl == nullptr) || (estimate < best_estimate)) {
            saved_impl = i;
            best_estimate = estimate;
        }
    }

    /* Return whichever method gave the best estimate. */
    if (saved_impl != nullptr) {
        impl = saved_impl;
        return true;
    }

    return false;
}
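As the loop above shows, selection can be steered through the GemmConfig reached via args._cfg. A minimal sketch follows; the method and filter fields are taken from the code above, but GemmMethod::GEMM_INTERLEAVED is an assumed enumerator and the struct's other members are not shown here.

// Sketch: restrict selection to interleaved kernels whose name contains "a64".
arm_gemm::GemmConfig cfg;
cfg.method = arm_gemm::GemmMethod::GEMM_INTERLEAVED;
cfg.filter = "a64";
// Attach &cfg to the GemmArgs (args._cfg) before calling gemm<>() so that
// find_implementation() applies the method and name filters seen above.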

◆ FixupRowSums()

void arm_gemm::FixupRowSums ( TOut *&  out,
const int32_t  row_sum_multiplier 
)
inline

Definition at line 123 of file interleave_indirect.cpp.

References SVE.

{
    const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);

    // If we are integrating row sums, we need to do some fix up, depending on whether the multiplier is non-zero or not.
    if (row_sum_multiplier) {
        // Non-zero: interleave_block<>() will have done the sums, so 'out' will point to the start of the
        // next block (post sums).
        // We need to go back and apply the multiplier to the computed sums. We don't need to change 'out'.
        int32_t *out_int32 = reinterpret_cast<int32_t *>(out);

        out_int32 -= height;
        for (unsigned int i=0; i<height; i++) {
            out_int32[i] *= row_sum_multiplier;
        }
    } else {
        // Zero: interleave_block<>() will *not* have done the sums, so 'out' will point to the start of the
        // sum block. We need to insert the (zero) sums, and advance 'out'.
        int32_t *out_int32 = reinterpret_cast<int32_t *>(out);

        for (unsigned int i=0; i<height; i++) {
            out_int32[i] = 0;
        }

        out_int32 += height;

        out = reinterpret_cast<TOut *>(out_int32);
    }
}

◆ gemm()

UniqueGemmCommon<Top, Tret> arm_gemm::gemm ( const GemmArgs &  args,
const OutputStage &  os 
)

Definition at line 239 of file gemm_implementation.hpp.

References GemmImplementation< Top, Tret, OutputStage >::do_instantiate().

{
    const GemmImplementation<Top, Tret, OutputStage> *impl;

    if (find_implementation<Top, Tret, OutputStage>(args, os, impl)) {
        return UniqueGemmCommon<Top, Tret>(impl->do_instantiate(args, os));
    }

    return UniqueGemmCommon<Top, Tret>(nullptr);
}
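A minimal usage sketch for the main entry point. The GemmArgs constructor parameter list shown here (CPUInfo pointer, M/N/K sizes, section/batch/multi counts, indirect flag, activation, thread count) is assumed from this release's headers and may differ between versions.

arm_gemm::UniqueGemmCommon<float, float> make_sgemm(const CPUInfo *ci) {
    // 'ci' must describe the target core; the problem sizes are illustrative.
    arm_gemm::Activation act(arm_gemm::Activation::Type::None);
    arm_gemm::GemmArgs args(ci, /*M=*/128, /*N=*/128, /*K=*/128,
                            /*Ksections=*/1, /*nbatches=*/1, /*nmulti=*/1,
                            /*indirect_input=*/false, act, /*maxthreads=*/1);

    // Picks the best candidate via find_implementation() and instantiates it;
    // returns a null pointer if nothing supports these arguments.
    return arm_gemm::gemm<float, float, arm_gemm::Nothing>(args, arm_gemm::Nothing());
}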

◆ gemm< bfloat16, float, Nothing >()

template UniqueGemmCommon<bfloat16, float> arm_gemm::gemm< bfloat16, float, Nothing > ( const GemmArgs &  args,
const Nothing &   
)

◆ gemm< float, float, Nothing >()

template UniqueGemmCommon<float, float> arm_gemm::gemm< float, float, Nothing > ( const GemmArgs &  args,
const Nothing &   
)

◆ gemm_implementation_list()

const GemmImplementation<Top, Tret, OutputStage>* arm_gemm::gemm_implementation_list ( )

◆ gemm_implementation_list< bfloat16, float >()

const GemmImplementation<bfloat16, float>* arm_gemm::gemm_implementation_list< bfloat16, float > ( )

◆ gemm_implementation_list< float, float >()

const GemmImplementation<float, float>* arm_gemm::gemm_implementation_list< float, float > ( )

◆ get_compatible_kernels()

std::vector<KernelDescription> arm_gemm::get_compatible_kernels ( const GemmArgs &  args,
const OutputStage &  os 
)

Definition at line 216 of file gemm_implementation.hpp.

References GemmImplementation< Top, Tret, OutputStage >::do_cycle_estimate(), find_implementation(), and GemmImplementation< Top, Tret, OutputStage >::method.

{
    std::vector<KernelDescription> res;

    /* Find out what the default implementation is so we can set the flag accordingly later. */
    const GemmImplementation<Top, Tret, OutputStage> *default_impl;
    find_implementation(args, os, default_impl);

    auto gemms = gemm_implementation_list<Top, Tret, OutputStage>();

    for (const GemmImplementation<Top, Tret, OutputStage> *i = gemms; i->method != GemmMethod::DEFAULT; i++) {
        /* Check that this implementation supports the presented problem. */
        if (!i->do_is_supported(args, os)) {
            continue;
        }

        res.push_back(KernelDescription(i->method, i->name, i == default_impl, i->do_cycle_estimate(args, os)));
    }

    return res;
}
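A minimal sketch that lists every candidate for a given problem. The KernelDescription member names used here (name, is_default, cycle_estimate) are assumed from the headers.

#include <cstdio>

void list_kernels(const arm_gemm::GemmArgs &args) {
    for (const auto &k : arm_gemm::get_compatible_kernels<float, float, arm_gemm::Nothing>(args, arm_gemm::Nothing())) {
        std::printf("%s%s: estimated %llu cycles\n",
                    k.name.c_str(),
                    k.is_default ? " (default)" : "",
                    static_cast<unsigned long long>(k.cycle_estimate));
    }
}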

◆ get_compatible_kernels< bfloat16, float, Nothing >()

template std::vector<KernelDescription> arm_gemm::get_compatible_kernels< bfloat16, float, Nothing > ( const GemmArgs &  args,
const Nothing &   
)

◆ get_compatible_kernels< float, float, Nothing >()

template std::vector<KernelDescription> arm_gemm::get_compatible_kernels< float, float, Nothing > ( const GemmArgs &  args,
const Nothing &   
)

◆ get_gemm_method()

KernelDescription arm_gemm::get_gemm_method ( const GemmArgs &  args,
const OutputStage &  os 
)

Definition at line 250 of file gemm_implementation.hpp.

References GemmImplementation< Top, Tret, OutputStage >::method, and GemmImplementation< Top, Tret, OutputStage >::name.

{
    const GemmImplementation<Top, Tret, OutputStage> *impl;

    if (find_implementation<Top, Tret>(args, os, impl)) {
        return KernelDescription(impl->method, impl->name);
    }

    /* This shouldn't happen - there should always be at least one valid implementation. */
    return KernelDescription();
}

◆ get_gemm_method< bfloat16, float, Nothing >()

template KernelDescription arm_gemm::get_gemm_method< bfloat16, float, Nothing > ( const GemmArgs &  args,
const Nothing &   
)

◆ get_gemm_method< float, float, Nothing >()

template KernelDescription arm_gemm::get_gemm_method< float, float, Nothing > ( const GemmArgs &  args,
const Nothing &   
)

◆ iceildiv()

T arm_gemm::iceildiv ( const T  a,
const T  b 
)
inline

Integer ceiling division: returns the smallest T no less than a / b.

{
    return (a + b - 1) / b;
}

◆ IndirectInterleave()

void IndirectInterleave ( TOut *  out,
const TIn *const *const *  ptr,
unsigned int  stringlen,
unsigned int  rounded_stringlen,
const unsigned int  y0,
const unsigned int  ymax,
const unsigned int  k0,
const unsigned int  kmax,
bool  integrate_sums,
const int32_t  row_sum_multiplier 
)

Definition at line 153 of file interleave_indirect.cpp.

References SVE.

{
    const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);

    // 'interleave_block' implementations are entitled to read a pointer for each row they handle from the input
    // pointer array, even for out of range rows (although they must not subsequently dereference those pointers for
    // out of range rows). This allows interleave_block to use techniques like row predication, or loading all
    // pointers and conditionally overriding the out of range ones.

    // This is problematic in the "pure" indirect case when we get to the last rows, where it can lead to out of
    // range reads. Avoid this with a local buffer to use in last-rows cases. Use alloca as a std::vector can be
    // expensive in highly threaded scenarios.
    const TIn **row_ptrs = reinterpret_cast<const TIn **>(alloca(height * sizeof(const TIn *)));

    // Figure out the starting position based on k0 (with rounded length)
    unsigned int start_string    = k0 / rounded_stringlen;
    unsigned int start_stringpos = k0 % rounded_stringlen;

    // Process blocks of 'height' height...
    for (unsigned int ybase = y0; ybase < ymax; ybase += height) {
        // Height to process
        unsigned int active_height = std::min(ymax - ybase, height);

        // Track our progress through the various strings
        unsigned int k_left    = (kmax - k0);
        unsigned int string    = start_string;
        unsigned int stringpos = start_stringpos;

        bool first = true;

        // Prepare to call 'interleave_block' above for each string encompassed by K range
        while (k_left > 0) {
            // Width to process - and the width we will generate (with padding)
            unsigned int in_width  = std::min(k_left, stringlen - stringpos);
            unsigned int out_width = std::min(k_left, rounded_stringlen - stringpos);

            const TIn * const *row_base = ptr[string] + ybase;

            // If not all rows are valid, copy the ones that are into local array (see above comment).
            if (active_height < height) {
                for (unsigned int i=0; i<active_height; i++) {
                    row_ptrs[i] = ptr[string][ybase + i];
                }

                row_base = row_ptrs;
            }

            // 'integrate_sums' is a function parameter rather than a template parameter to prevent duplicating too
            // much code. However, integrated sums make no sense for non-integral types and won't ever be
            // requested. So put a type trait check here to avoid generating pointless code.
            if (std::is_integral<TOut>::value && integrate_sums && row_sum_multiplier) {
                interleave_block<height_vectors, block, vlt, true>(out, row_base, in_width, active_height, stringpos, first);
            } else {
                interleave_block<height_vectors, block, vlt, false>(out, row_base, in_width, active_height, stringpos, first);
            }

            k_left -= out_width;
            string++;
            stringpos = 0;
            first = false;
        }

        if (std::is_integral<TOut>::value && integrate_sums) {
            FixupRowSums<height_vectors, block, vlt>(out, row_sum_multiplier);
        }
    }
}

◆ Interleave()

void Interleave ( TOut *  out,
const TIn *  in,
size_t  in_stride,
const unsigned int  y0,
const unsigned int  ymax,
const unsigned int  k0,
const unsigned int  kmax,
bool  integrate_sums,
const int32_t  row_sum_multiplier 
)

Definition at line 263 of file interleave_indirect.cpp.

References SVE.

{
    const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);

    // Use alloca here as a std::vector can be expensive in highly threaded scenarios.
    const TIn **row_ptrs = reinterpret_cast<const TIn **>(alloca(height * sizeof(const TIn *)));

    const unsigned int width = kmax - k0;

    for (unsigned int y=y0; y<ymax; y+=height) {
        for (unsigned int r=0; r<height; r++) {
            row_ptrs[r] = in + ((y + r) * in_stride);
        }

        if (std::is_integral<TOut>::value && integrate_sums && row_sum_multiplier) {
            interleave_block<height_vectors, block, vlt, true>(out, row_ptrs, width, std::min(height, ymax-y), k0, true);
        } else {
            interleave_block<height_vectors, block, vlt, false>(out, row_ptrs, width, std::min(height, ymax-y), k0, true);
        }

        if (std::is_integral<TOut>::value && integrate_sums) {
            FixupRowSums<height_vectors, block, vlt>(out, row_sum_multiplier);
        }
    }
}
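An illustrative sketch only: these interleave routines are internal plumbing, normally invoked by the interleaved GEMM implementations with a strategy's height/block parameters. The values below (height_vectors=8, block=1, fixed-length vectors) are chosen for the example, not taken from any strategy.

// Interleave a 6x10 float matrix into one 8-row panel; rows 6 and 7 of the
// panel are zero padded by interleave_block, as described above.
float in[8 * 10] = {};  // only the first 6 rows hold real data (row-major, stride 10)
float out[8 * 10];      // one interleaved panel: 8 rows x 10 columns

arm_gemm::Interleave<8, 1, arm_gemm::VLType::None>(
    out, in, /*in_stride=*/10,
    /*y0=*/0, /*ymax=*/6, /*k0=*/0, /*kmax=*/10,
    /*integrate_sums=*/false, /*row_sum_multiplier=*/0);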

◆ interleave_block()

void arm_gemm::interleave_block ( TOut *&  out,
const TIn *const *  in,
size_t  width,
size_t  height,
size_t  row_offset,
bool  first 
)

Definition at line 59 of file interleave_indirect.cpp.

References SVE.

{
    const unsigned int int_by = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);

    std::vector<int32_t> the_sums;

    if (integrate_sums) {
        the_sums = std::vector<int32_t>(int_by, 0);

        if (!first) {
            // In 'integrate sums' mode, we dump the sums at the end on each pass.

            // On the last pass this is correct, but on other passes it is not -
            // so on the subsequent pass we need to take the output written by
            // the previous pass as starting point for the sums, and then
            // overwrite them with new interleaved data.
            int32_t *out_int32 = reinterpret_cast<int32_t *>(out);

            // Rewind pointer to where we wrote out the sums last time.
            out_int32 -= int_by;

            // Restore the running sums.
            memcpy(the_sums.data(), out_int32, int_by * sizeof(int32_t));

            // Update the "real" pointer so that the next output will clobber the old sums.
            out = reinterpret_cast<TOut *>(out_int32);
        }
    }

    for (unsigned int pos=0; pos<width; pos+=block) {
        for (unsigned int row=0; row<int_by; row++) {
            // Row out of range - pad 'block' entries.
            if (row >= height) {
                for (unsigned int col=0; col<block; col++) {
                    *out++ = 0;
                }
                continue;
            }

            for (unsigned int col=0; col<block; col++) {
                // Column out of range - pad a single entry.
                if (pos + col >= width) {
                    *out++ = 0;
                    continue;
                }

                if (integrate_sums) {
                    the_sums[row] += in[row][row_offset + pos + col];
                }

                *out++ = in[row][row_offset + pos + col];
            }
        }
    }

    if (integrate_sums) {
        int32_t *out_int32 = reinterpret_cast<int32_t *>(out);

        memcpy(out_int32, the_sums.data(), int_by * sizeof(int32_t));

        out = reinterpret_cast<TOut *>(out_int32 + int_by);
    }
}

◆ MergeResults()

void MergeResults ( Tout *  out,
const Tin *  in,
int  ldc,
int  y0,
int  ymax,
int  x0,
int  xmax,
const Tout *  bias,
Activation  act,
bool  append 
)

Definition at line 39 of file mergeresults.cpp.


Referenced by StdTransformsSVE< TOperand, TResult, height, width_vectors, block, mmla, integrate_sums >::Merge().

{
    // For SVE cases, multiply the width up by the vector length.
    // Use the *input* type to determine this, since this will be what the kernel operated on.
    const int width = twidth * (sve ? get_vector_length<Tin>() : 1);

    const int full_y_blocks = (ymax - y0) / height;
    const int y_remainder   = (ymax - y0) % height;
    const int y_blocks      = full_y_blocks + (y_remainder ? 1 : 0);

    const int full_x_blocks = (xmax - x0) / width;
    const int x_remainder   = (xmax - x0) % width;
    const int x_blocks      = full_x_blocks + (x_remainder ? 1 : 0);

    for (int y_block = 0; y_block < y_blocks; y_block++) {
        int ybase = y0 + (y_block * height);

        int fill_rows = (y_block < full_y_blocks) ? height : y_remainder;

        for (int x_block = 0; x_block < x_blocks; x_block++) {
            int xbase = x0 + (x_block * width);

            int fill_cols = (x_block < full_x_blocks) ? width : x_remainder;

            for (int row=0; row < fill_rows; row++) {
                for (int col=0; col < fill_cols; col++) {
                    Tout &r = out[(ybase + row) * ldc + xbase + col];
                    Tout v  = in[row * width + col];

                    if (append) {
                        v += r;
                    }

                    if (bias) {
                        v += bias[xbase + col];
                    }

                    switch(act.type) {
                        default:
                        case Activation::Type::None:
                            break;

                        case Activation::Type::ReLU:
                            v = std::max(v, static_cast<Tout>(0));
                            break;

                        case Activation::Type::BoundedReLU:
                            v = std::max(std::min(v, static_cast<Tout>(act.param1)), static_cast<Tout>(0));
                            break;
                    }

                    r = v;
                }
            }

            in += (width * height);
        }
    }
}
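A minimal sketch of merging one kernel block. The template arguments <8, 6> (block width in elements, block height) are illustrative values, not taken from any strategy, and the buffers are assumed to be correctly sized by the caller.

void merge_example(float *C, int ldc, const float *acc, const float *bias) {
    // Merge a 6-row x 8-column block-major accumulator 'acc' into the
    // row-major output 'C', adding 'bias' and applying a plain ReLU.
    arm_gemm::MergeResults<8, 6>(C, acc, ldc,
                                 /*y0=*/0, /*ymax=*/6, /*x0=*/0, /*xmax=*/8,
                                 bias, arm_gemm::Activation(arm_gemm::Activation::Type::ReLU),
                                 /*append=*/false);
}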

◆ quant_hybrid_asymmetric()

bool arm_gemm::quant_hybrid_asymmetric ( const Requantize32 &  qp)
inline

Definition at line 107 of file utils.hpp.

References quant_no_left_shift().

{
    return quant_no_left_shift(qp) /* && qp.b_offset != 0 */ && qp.per_channel_requant == false;
}

◆ quant_hybrid_symmetric()

bool arm_gemm::quant_hybrid_symmetric ( const Requantize32 &  qp)
inline

Definition at line 101 of file utils.hpp.

References quant_no_left_shift().

{
    return quant_no_left_shift(qp) && qp.b_offset == 0;
}

◆ quant_no_left_shift()

bool arm_gemm::quant_no_left_shift ( const Requantize32 &  qp)
inline

Definition at line 91 of file utils.hpp.

Referenced by quant_hybrid_asymmetric(), and quant_hybrid_symmetric().

{
    if (qp.per_channel_requant) {
        return (qp.per_channel_left_shifts == nullptr);
    } else {
        return (qp.per_layer_left_shift == 0);
    }
}
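These three predicates only inspect the Requantize32 fields shown in the bodies above. A small sketch follows, assuming Requantize32 can be value-initialised; only fields read by the predicates are set.

#include <cassert>

void quant_predicates_example() {
    arm_gemm::Requantize32 qp{};
    qp.per_channel_requant  = false;  // per-layer quantization...
    qp.per_layer_left_shift = 0;      // ...with no left shift
    qp.b_offset             = 0;      // symmetric weights

    assert(arm_gemm::quant_no_left_shift(qp));
    assert(arm_gemm::quant_hybrid_symmetric(qp));   // b_offset == 0
    assert(arm_gemm::quant_hybrid_asymmetric(qp));  // also true: the b_offset test above is commented out
}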

◆ requantize_block_32()

void arm_gemm::requantize_block_32 ( const Requantize32 &  qp,
unsigned int  width,
unsigned int  height,
const Tin *  input,
unsigned int  in_stride,
Tout *  output,
unsigned int  out_stride,
const int32_t *  row_bias,
const int32_t *  col_bias,
unsigned int  start_col 
)

◆ roundup()

T arm_gemm::roundup ( const T  a,
const T  b 
)
inline

Definition at line 45 of file utils.hpp.


Referenced by NEWinogradConvolutionLayer::configure(), NEWinogradLayerTransformOutputKernel< T, OutputTileRows, OutputTileCols, KernelRows, KernelCols >::configure(), GemmHybrid< strategy, To, Tr >::estimate_cycles(), GemmInterleavedPretransposed2d< strategy, To, Tr >::estimate_cycles(), GemmHybridIndirect< strategy, To, Tr, OutputStage, SeparateQuantize >::estimate_cycles(), GemmInterleaved< strategy, To, Tr, OutputStage, MergeStep, ForceThreadColumns >::estimate_cycles(), GemmInterleaved< strategy, To, Tr, OutputStage, MergeStep, ForceThreadColumns >::execute(), GemmHybrid< strategy, To, Tr >::get_B_pretransposed_array_size(), GemmHybridQuantizedInline< strategy, To, Tr >::get_B_pretransposed_array_size(), GemmHybridQuantized< strategy, To, Tr >::get_B_pretransposed_array_size(), GemmHybridIndirect< strategy, To, Tr, OutputStage, SeparateQuantize >::get_B_pretransposed_array_size(), GemmInterleaved< strategy, To, Tr, OutputStage, MergeStep, ForceThreadColumns >::get_B_pretransposed_array_size(), GemmHybrid< strategy, To, Tr >::pretranspose_B_array(), GemmHybridQuantizedInline< strategy, To, Tr >::pretranspose_B_array(), GemmHybridQuantized< strategy, To, Tr >::pretranspose_B_array(), and NEWinogradLayerTransformWeightsKernel< T, OutputTileRows, OutputTileCols, KernelRows, KernelCols >::run().

{
    T rem = a % b;

    if (rem) {
        return a + b - rem;
    } else {
        return a;
    }
}
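Worked examples for the two rounding helpers (values illustrative):

unsigned int blocks = arm_gemm::iceildiv(10u, 4u);  // 3  (ceiling of 10/4)
unsigned int padded = arm_gemm::roundup(10u, 4u);   // 12 (next multiple of 4)
unsigned int same   = arm_gemm::roundup(12u, 4u);   // 12 (already a multiple)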

◆ row_sums_indirect()

void arm_gemm::row_sums_indirect ( unsigned int  num_strings,
const unsigned int *  string_lengths,
IndirectInputArg< T >  A_arg,
size_t  M,
int32_t *  output_ptr,
const Requantize32 *  qp 
)

◆ Transform()

void arm_gemm::Transform ( TOut *  out,
const TIn *const  in,
const int  stride,
const int  k0,
const int  kmax,
const int  x0,
const int  xmax 
)

Definition at line 109 of file transform.hpp.

{
    // Redirect to a specialised implementation predicated on argument size.
    TransformImpl<IntBy, BlockBy, Transposed, sizeof(TIn), sizeof(TOut), vlt>::Transform(
        out, in, stride, k0, kmax, x0, xmax
    );
}

Variable Documentation

◆ report_mutex

std::mutex report_mutex

Definition at line 32 of file misc.cpp.