Namespaces
	utils

Data Structures
class	barrier

class	convolver

class	GemmHybrid

class	GemmHybridIndirect

class	GemmHybridQuantized

class	GemmHybridQuantizedInline

struct	GemmImplementation

struct	GemmImplementation< Top, Tret, Nothing >

class	GemmInterleaved

class	GemmInterleavedPretransposed2d

class	GemvBatched

class	GemvPretransposed

struct	IndirectInputArg

struct	IndirectOutputArg

struct	PerformanceParameters

class	QuantizeWrapper

class	StdTransformsFixed

class	StdTransformsSVE

struct	TransformImpl

Typedefs
using	bfloat16 = arm_compute::bfloat16

template<typename strategy , typename To , typename Tr , typename OutputStage = Nothing>
using	GemmInterleavedNoMerge = GemmInterleaved< strategy, To, Tr, OutputStage, false >

template<typename strategy , typename To , typename Tr >
using	GemmInterleavedPretransposedNoMergeQuantizedInline = GemmInterleaved< strategy, To, Tr, Requantize32, false >

template<typename strategy , typename To , typename Tr >
using	GemmInterleavedQuantized = GemmInterleaved< strategy, To, Tr, Requantize32 >

Enumerations
enum	VLType { None, SVE }

Functions
template<typename T >
void	bias_adder (T *out, unsigned int stride, const T *bias, unsigned int rows, unsigned int cols)

template<bool DoBias, typename T >
void	activator (T *out, unsigned int stride, const T *bias, Activation act, unsigned int rows, unsigned int cols)

template<>
const GemmImplementation< bfloat16, float > *	gemm_implementation_list< bfloat16, float > ()

template UniqueGemmCommon< bfloat16, float >	gemm< bfloat16, float, Nothing > (const GemmArgs &args, const Nothing &)

template KernelDescription	get_gemm_method< bfloat16, float, Nothing > (const GemmArgs &args, const Nothing &)

template std::vector< KernelDescription >	get_compatible_kernels< bfloat16, float, Nothing > (const GemmArgs &args, const Nothing &)

template<>
const GemmImplementation< float, float > *	gemm_implementation_list< float, float > ()

template UniqueGemmCommon< float, float >	gemm< float, float, Nothing > (const GemmArgs &args, const Nothing &)

template KernelDescription	get_gemm_method< float, float, Nothing > (const GemmArgs &args, const Nothing &)

template std::vector< KernelDescription >	get_compatible_kernels< float, float, Nothing > (const GemmArgs &args, const Nothing &)

template<typename Top , typename Tret , class OutputStage = Nothing>
const GemmImplementation< Top, Tret, OutputStage > *	gemm_implementation_list ()

template<typename Top , typename Tret , class OutputStage >
bool	find_implementation (const GemmArgs &args, const OutputStage &os, const GemmImplementation< Top, Tret, OutputStage > *&impl)

template<typename Top , typename Tret , class OutputStage >
std::vector< KernelDescription >	get_compatible_kernels (const GemmArgs &args, const OutputStage &os)

template<typename Top , typename Tret , class OutputStage >
UniqueGemmCommon< Top, Tret >	gemm (const GemmArgs &args, const OutputStage &os)

template<typename Top , typename Tret , class OutputStage >
KernelDescription	get_gemm_method (const GemmArgs &args, const OutputStage &os)

template<unsigned int height_vectors, unsigned int block, VLType vlt, bool integrate_sums, typename TIn , typename TOut >
void	interleave_block (TOut *&out, const TIn *const *in, size_t width, size_t height, size_t row_offset, bool first)

template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TOut >
void	FixupRowSums (TOut *&out, const int32_t row_sum_multiplier)

template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TIn , typename TOut >
void	IndirectInterleave (TOut *out, const TIn *const *const *ptr, unsigned int stringlen, unsigned int rounded_stringlen, const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool integrate_sums, const int32_t row_sum_multiplier)

template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TIn , typename TOut >
void	ConvolutionInterleave (TOut *out, const TIn *in, size_t in_stride, const convolver< TIn > &conv, const unsigned int rounded_stringlen, const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool integrate_sums, const int32_t row_sum_multiplier)

template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TIn , typename TOut >
void	Interleave (TOut *out, const TIn *in, size_t in_stride, const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool integrate_sums, const int32_t row_sum_multiplier)

template<unsigned int twidth, unsigned int height, bool sve = false, typename Tin , typename Tout >
void	MergeResults (Tout *out, const Tin *in, int ldc, int y0, int ymax, int x0, int xmax, const Tout *bias, Activation act, bool append)

template<typename Tin , typename Tout >
void	requantize_block_32 (const Requantize32 &qp, unsigned int width, unsigned int height, const Tin *input, unsigned int in_stride, Tout *output, unsigned int out_stride, const int32_t *row_bias, const int32_t *col_bias, unsigned int start_col)

template<typename T >
void	compute_row_sums (const Requantize32 &qp, unsigned int width, unsigned int height, const T *input, unsigned int in_stride, int32_t *row_bias)

template<typename T >
void	compute_col_sums (const Requantize32 &qp, unsigned int width, unsigned int height, const T *input, unsigned int in_stride, int32_t *col_bias, unsigned int depth, unsigned int multi, unsigned int first_col)

template<typename T >
void	row_sums_indirect (unsigned int num_strings, const unsigned int *string_lengths, IndirectInputArg< T > A_arg, size_t M, int32_t *output_ptr, const Requantize32 *qp)

template<unsigned int IntBy, unsigned int BlockBy, bool Transposed, VLType vlt = VLType::None, typename TOut , typename TIn >
void	Transform (TOut *out, const TIn *const in, const int stride, const int k0, const int kmax, const int x0, const int xmax)

template<typename T >
T	iceildiv (const T a, const T b)

template<typename T >
T	roundup (const T a, const T b)

bool	quant_no_left_shift (const Requantize32 &qp)

bool	quant_hybrid_symmetric (const Requantize32 &qp)

bool	quant_hybrid_asymmetric (const Requantize32 &qp)

Variables
std::mutex	report_mutex

Typedef Documentation

◆ bfloat16

using bfloat16 = arm_compute::bfloat16

Definition at line 30 of file bfloat.hpp.

◆ GemmInterleavedNoMerge

using GemmInterleavedNoMerge = GemmInterleaved<strategy, To, Tr, OutputStage, false>

Definition at line 1049 of file gemm_interleaved.hpp.

◆ GemmInterleavedPretransposedNoMergeQuantizedInline

using GemmInterleavedPretransposedNoMergeQuantizedInline = GemmInterleaved<strategy, To, Tr, Requantize32, false>

Definition at line 1052 of file gemm_interleaved.hpp.

◆ GemmInterleavedQuantized

using GemmInterleavedQuantized = GemmInterleaved<strategy, To, Tr, Requantize32>

Definition at line 1055 of file gemm_interleaved.hpp.

Enumeration Type Documentation

◆ VLType

enum VLType

strong

Enumerator
None
SVE

Definition at line 55 of file utils.hpp.

                   {
     None, // the interleave helpers on this page use a height multiplier of 1 unless vlt is SVE
     SVE,  // the interleave helpers on this page scale the height by get_vector_length<TOut>() / block
 };

Function Documentation

◆ activator()

void arm_gemm::activator	(	T *	out,
		unsigned int	stride,
		const T *	bias,
		Activation	act,
		unsigned int	rows,
		unsigned int	cols
	)

inline

Definition at line 40 of file bias_adder.hpp.

                                                                                                                         {
     if (act.type == Activation::Type::None) {
         if (DoBias) {
             bias_adder(out, stride, bias, rows, cols);
         }
         return;
     }
 
     if (act.type == Activation::Type::ReLU) {
         for (unsigned int row=0; row<rows; row++) {
             for (unsigned int col=0; col<cols; col++) {
                 T &v = out[row * stride + col];
                 if (DoBias) {
                     v += bias[col];
                 }
                 v = std::max(static_cast<T>(0), v);
             }
         }
     }
 
     if (act.type == Activation::Type::BoundedReLU) {
         const T max = static_cast<T>(act.param1);
 
         for (unsigned int row=0; row<rows; row++) {
             for (unsigned int col=0; col<cols; col++) {
                 T &v = out[row * stride + col];
                 if (DoBias) {
                     v += bias[col];
                 }
                 v = std::max(static_cast<T>(0), std::min(v, max));
             }
         }
     }
 }

References bias_adder(), caffe_mnist_image_extractor::cols, tf_frozen_model_extractor::None, and caffe_mnist_image_extractor::rows.

◆ bias_adder()

void arm_gemm::bias_adder	(	T *	out,
		unsigned int	stride,
		const T *	bias,
		unsigned int	rows,
		unsigned int	cols
	)

inline

Definition at line 31 of file bias_adder.hpp.

                                                                                                          {
     for (unsigned int row=0; row<rows; row++) {
         for (unsigned int col=0; col<cols; col++) {
             out[row * stride + col] += bias[col];
         }
     }
 }

References caffe_mnist_image_extractor::cols, and caffe_mnist_image_extractor::rows.

Referenced by activator(), and GemmHybrid< strategy, To, Tr >::execute().

◆ compute_col_sums()

void arm_gemm::compute_col_sums	(	const Requantize32 &	qp,
		unsigned int	width,
		unsigned int	height,
		const T *	input,
		unsigned int	in_stride,
		int32_t *	col_bias,
		unsigned int	depth,
		unsigned int	multi,
		unsigned int	first_col
	)

Referenced by GemmHybridQuantizedInline< strategy, To, Tr >::pretranspose_B_array(), GemmHybridQuantized< strategy, To, Tr >::pretranspose_B_array(), GemmHybridIndirect< strategy, To, Tr, OutputStage, SeparateQuantize >::pretranspose_B_array(), and GemmInterleaved< strategy, To, Tr, OutputStage, MergeStep, ForceThreadColumns >::pretranspose_B_array().

◆ compute_row_sums()

void arm_gemm::compute_row_sums	(	const Requantize32 &	qp,
		unsigned int	width,
		unsigned int	height,
		const T *	input,
		unsigned int	in_stride,
		int32_t *	row_bias
	)

Referenced by GemmHybridQuantized< strategy, To, Tr >::execute().

◆ ConvolutionInterleave()

void ConvolutionInterleave	(	TOut *	out,
		const TIn *	in,
		size_t	in_stride,
		const convolver< TIn > &	conv,
		const unsigned int	rounded_stringlen,
		const unsigned int	y0,
		const unsigned int	ymax,
		const unsigned int	k0,
		const unsigned int	kmax,
		bool	integrate_sums,
		const int32_t	row_sum_multiplier
	)

Definition at line 224 of file interleave_indirect.cpp.

                                                                                                                                                                {
     // Interleaves rows [y0, ymax) into 'out', 'height' rows at a time, using 'conv' to turn the
     // convolution input into per-row pointers for each chunk handed to interleave_block.
     // 'height' is height_vectors, multiplied by get_vector_length<TOut>() / block when vlt is SVE.
     const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);
 
     auto conv_cols = conv.process_columns(in, in_stride, k0, kmax, rounded_stringlen);
 
     // Use alloca here as a std::vector can be expensive in highly threaded scenarios.
     const TIn **row_ptrs = reinterpret_cast<const TIn **>(alloca(height * sizeof(const TIn *)));
 
     for (unsigned int ybase = y0; ybase < ymax; ybase += height) {
         // How many of the rows are active - the rest will get padded in interleave_block.
         unsigned int active_height   = std::min(ymax - ybase, height);
         // 'first' is true only for the first interleave_block call of each row block.
         bool first = true;
 
         auto conv_rows = conv_cols.process_rows(ybase, active_height);
 
         while (!conv_rows.finished()) {
             unsigned int width, offset;
 
             // Get next set of parameters
             std::tie(width, offset) = conv_rows.next_block(row_ptrs);
 
             // Perform the interleave
             // The summing variant is only selected for integral output types, when sums were
             // requested and the multiplier is non-zero.
             if (std::is_integral<TOut>::value && integrate_sums && row_sum_multiplier) {
                 interleave_block<height_vectors, block, vlt, true>(out, row_ptrs, width, active_height, offset, first);
             } else {
                 interleave_block<height_vectors, block, vlt, false>(out, row_ptrs, width, active_height, offset, first);
             }
 
             first=false;
         }
 
         // Runs whenever sums were requested, including the zero-multiplier case that skipped
         // the summing variant above.
         if (std::is_integral<TOut>::value && integrate_sums) {
             FixupRowSums<height_vectors, block, vlt>(out, row_sum_multiplier);
         }
     }
 }

References offset(), convolver< T >::process_columns(), and SVE.

◆ find_implementation()

bool arm_gemm::find_implementation	(	const GemmArgs &	args,
		const OutputStage &	os,
		const GemmImplementation< Top, Tret, OutputStage > *&	impl
	)

Definition at line 166 of file gemm_implementation.hpp.

                                                                                                                                 {
     // Picks an implementation from gemm_implementation_list<Top, Tret, OutputStage>() for 'args'.
     // Returns true and stores the choice in 'impl'; returns false without writing 'impl' when
     // no candidate passes the support, method and filter checks below.
     auto gemms = gemm_implementation_list<Top, Tret, OutputStage>();
     const GemmConfig *cfg = args._cfg;
 
     const GemmImplementation<Top, Tret, OutputStage> *saved_impl = nullptr;
     // Only meaningful once saved_impl is non-null.
     uint64_t best_estimate = 0;
 
     // The list is walked until an entry whose method is GemmMethod::DEFAULT is reached.
     for (const GemmImplementation<Top, Tret, OutputStage> *i = gemms; i->method != GemmMethod::DEFAULT; i++) {
         /* Skip if this implementation doesn't support these args. */
         if (!i->do_is_supported(args, os)) {
             continue;
         }
 
         /* Skip if a specific method is requested and this is a different one. */
         if (cfg && cfg->method != GemmMethod::DEFAULT && i->method != cfg->method) {
             continue;
         }
 
         /* Skip if a filter is to be applied and it doesn't match. */
         // The filter is a plain substring match against the implementation name.
         if (cfg && cfg->filter != "" && !strstr(i->name, cfg->filter.c_str())) {
             continue;
         }
 
         /* Test the cycle estimate */
         uint64_t estimate = i->do_cycle_estimate(args, os);
 
         /* Short circuit - if the estimate is zero, return this one immediately. */
         if (estimate==0) {
             impl=i;
             return true;
         }
 
         /* Otherwise, remember this is our best so far if we don't yet have
          * a valid candidate, or we beat the estimate.  */
         // Strict '<' means the earliest entry in the list wins a tie.
         if ((saved_impl == nullptr) || (estimate < best_estimate)) {
             saved_impl = i;
             best_estimate = estimate;
         }
     }
 
     /* Return whichever method gave the best estimate. */
     if (saved_impl != nullptr) {
         impl = saved_impl;
         return true;
     }
 
     return false;
 }

References GemmTuner::args, GemmImplementation< Top, Tret, OutputStage >::do_cycle_estimate(), and GemmImplementation< Top, Tret, OutputStage >::method.

Referenced by get_compatible_kernels().

◆ FixupRowSums()

void arm_gemm::FixupRowSums	(	TOut *&	out,
		const int32_t	row_sum_multiplier
	)

inline

Definition at line 123 of file interleave_indirect.cpp.

                                                                         {
     // Finalises the 'height' int32 row-sum slots belonging to the block that was just interleaved.
     // 'out' is a reference to the caller's pointer because the zero-multiplier path advances it.
     // 'height' is height_vectors, multiplied by get_vector_length<TOut>() / block when vlt is SVE.
     const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);
 
     // If we are integrating row sums, we need to do some fix up, depending on whether the multiplier is non-zero or not.
     if (row_sum_multiplier) {
         // Non-zero: interleave_block<>() will have done the sums, so 'out' will point to the start of the
         // next block (post sums).
         // We need to go back and apply the multiplier to the computed sums.  We don't need to change 'out'.
         int32_t *out_int32 = reinterpret_cast<int32_t *>(out);
 
         // Step back over the sums that sit immediately before 'out'.
         out_int32 -= height;
         for (unsigned int i=0; i<height; i++) {
             out_int32[i] *= row_sum_multiplier;
         }
     } else {
         // Zero: interleave_block<>() will *not* have done the sums, so 'out' will point to the start of the
         // sum block.  We need to insert the (zero) sums, and advance 'out'.
         int32_t *out_int32 = reinterpret_cast<int32_t *>(out);
 
         for (unsigned int i=0; i<height; i++) {
             out_int32[i] = 0;
         }
 
         out_int32 += height;
 
         // Publish the advanced position back to the caller.
         out = reinterpret_cast<TOut *>(out_int32);
     }
 }

References SVE.

◆ gemm()

UniqueGemmCommon<Top, Tret> arm_gemm::gemm	(	const GemmArgs &	args,
		const OutputStage &	os
	)

Definition at line 239 of file gemm_implementation.hpp.

                                                                               {
     const GemmImplementation<Top, Tret, OutputStage> *impl;
 
     if (find_implementation<Top, Tret, OutputStage>(args, os, impl)) {
         return UniqueGemmCommon<Top, Tret>(impl->do_instantiate(args, os));
     }
 
     return UniqueGemmCommon<Top, Tret>(nullptr);
 }

References GemmTuner::args, and GemmImplementation< Top, Tret, OutputStage >::do_instantiate().

◆ gemm< bfloat16, float, Nothing >()

template UniqueGemmCommon<bfloat16, float> arm_gemm::gemm< bfloat16, float, Nothing >	(	const GemmArgs &	args,
		const Nothing &
	)

◆ gemm< float, float, Nothing >()

template UniqueGemmCommon<float, float> arm_gemm::gemm< float, float, Nothing >	(	const GemmArgs &	args,
		const Nothing &
	)

◆ gemm_implementation_list()

const GemmImplementation<Top, Tret, OutputStage>* arm_gemm::gemm_implementation_list ( )

◆ gemm_implementation_list< bfloat16, float >()

const GemmImplementation<bfloat16, float>* arm_gemm::gemm_implementation_list< bfloat16, float > ( )

Definition at line 122 of file gemm_bf16.cpp.

                                                                                        {
     // Method table for the <bfloat16, float> specialisation.
     return gemm_bf16_methods;
 }

◆ gemm_implementation_list< float, float >()

const GemmImplementation<float, float>* arm_gemm::gemm_implementation_list< float, float > ( )

Definition at line 189 of file gemm_fp32.cpp.

                                                                                  {
     // Method table for the <float, float> specialisation.
     return gemm_fp32_methods;
 }

◆ get_compatible_kernels()

std::vector<KernelDescription> arm_gemm::get_compatible_kernels	(	const GemmArgs &	args,
		const OutputStage &	os
	)

Definition at line 216 of file gemm_implementation.hpp.

                                                                                                  {
     std::vector<KernelDescription> res;
 
     /* Find out what the default implementation in so we can set the flag accordingly later. */
     const GemmImplementation<Top, Tret, OutputStage> *default_impl;
     find_implementation(args, os, default_impl);
 
     auto gemms = gemm_implementation_list<Top, Tret, OutputStage>();
 
     for (const GemmImplementation<Top, Tret, OutputStage> *i = gemms; i->method != GemmMethod::DEFAULT; i++) {
         /* Check that this implementation supports the presented problem. */
 
         if (!i->do_is_supported(args, os)) {
             continue;
         }
 
         res.push_back(KernelDescription(i->method, i->name, i==default_impl, i->do_cycle_estimate(args, os)));
     }
 
     return res;
 }

References GemmTuner::args, GemmImplementation< Top, Tret, OutputStage >::do_cycle_estimate(), find_implementation(), and GemmImplementation< Top, Tret, OutputStage >::method.

◆ get_compatible_kernels< bfloat16, float, Nothing >()

template std::vector<KernelDescription> arm_gemm::get_compatible_kernels< bfloat16, float, Nothing >	(	const GemmArgs &	args,
		const Nothing &
	)

◆ get_compatible_kernels< float, float, Nothing >()

template std::vector<KernelDescription> arm_gemm::get_compatible_kernels< float, float, Nothing >	(	const GemmArgs &	args,
		const Nothing &
	)

◆ get_gemm_method()

KernelDescription arm_gemm::get_gemm_method	(	const GemmArgs &	args,
		const OutputStage &	os
	)

Definition at line 250 of file gemm_implementation.hpp.

                                                                                {
     const GemmImplementation<Top, Tret, OutputStage> *impl;
 
     if (find_implementation<Top, Tret>(args, os, impl)) {
         return KernelDescription(impl->method, impl->name);
     }
 
     /* This shouldn't happen - there should always be at least one valid implementation. */
     return KernelDescription();
 }

References GemmTuner::args, GemmImplementation< Top, Tret, OutputStage >::method, and GemmImplementation< Top, Tret, OutputStage >::name.

◆ get_gemm_method< bfloat16, float, Nothing >()

template KernelDescription arm_gemm::get_gemm_method< bfloat16, float, Nothing >	(	const GemmArgs &	args,
		const Nothing &
	)

◆ get_gemm_method< float, float, Nothing >()

template KernelDescription arm_gemm::get_gemm_method< float, float, Nothing >	(	const GemmArgs &	args,
		const Nothing &
	)

◆ iceildiv()

T arm_gemm::iceildiv	(	const T	a,
		const T	b
	)

inline

Definition at line 40 of file utils.hpp.

                                         {
     return (a + b - 1) / b;
 }

References arm_compute::test::validation::b.

Referenced by NEWinogradConvolutionLayer::configure(), GemmInterleavedPretransposed2d< strategy, To, Tr >::estimate_cycles(), GemmInterleaved< strategy, To, Tr, OutputStage, MergeStep, ForceThreadColumns >::estimate_cycles(), GemvPretransposed< strategy, To, Tr >::execute(), GemmInterleavedPretransposed2d< strategy, To, Tr >::GemmInterleavedPretransposed2d(), GemmInterleavedPretransposed2d< strategy, To, Tr >::get_B_pretransposed_array_size(), GemvPretransposed< strategy, To, Tr >::get_window_size(), GemmInterleaved< strategy, To, Tr, OutputStage, MergeStep, ForceThreadColumns >::get_window_size(), and GemmInterleavedPretransposed2d< strategy, To, Tr >::pretranspose_B_array().

◆ IndirectInterleave()

void IndirectInterleave	(	TOut *	out,
		const TIn *const *const *	ptr,
		unsigned int	stringlen,
		unsigned int	rounded_stringlen,
		const unsigned int	y0,
		const unsigned int	ymax,
		const unsigned int	k0,
		const unsigned int	kmax,
		bool	integrate_sums,
		const int32_t	row_sum_multiplier
	)

Definition at line 153 of file interleave_indirect.cpp.

                                                           {
     // Interleaves rows [y0, ymax) over the K range [k0, kmax) into 'out', where the input is
     // reached through 'ptr': ptr[string] is an array of row pointers, indexed by row, for that string.
     // 'height' is height_vectors, multiplied by get_vector_length<TOut>() / block when vlt is SVE.
     const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);
 
     // 'interleave_block' implementations are entitled to read a pointer for each row they handle from the input
     // pointer array, even for out of range rows (although they must not subsequently dereference those pointers for
     // out of range rows).  This allows interleave_block to use techniques like row predication, or loading all
     // pointers and conditionally overriding the out of range ones.
 
     // This is problematic in the "pure" indirect case when we get to the last rows, where it can lead to out of
     // range reads.  Avoid this with a local buffer to use in last-rows cases.  Use alloca as a std::vector can be
     // expensive in highly threaded scenarios.
     const TIn **row_ptrs = reinterpret_cast<const TIn **>(alloca(height * sizeof(const TIn *)));
 
     // Figure out the starting position based on k0 (with rounded length)
     unsigned int start_string      = k0 / rounded_stringlen;
     unsigned int start_stringpos   = k0 % rounded_stringlen;
 
     // Process blocks of 'height' height...
     for (unsigned int ybase = y0; ybase < ymax; ybase+=height) {
         // Height to process
         unsigned int active_height = std::min(ymax - ybase, height);
 
         // Track our progress through the various strings
         unsigned int k_left    = (kmax - k0);
         unsigned int string    = start_string;
         unsigned int stringpos = start_stringpos;
 
         // 'first' is true only for the first interleave_block call of each row block.
         bool first = true;
 
         // Prepare to call 'interleave_block' above for each string encompassed by K range
         while (k_left > 0) {
             // Width to process - and the width we will generate (with padding)
             unsigned int in_width   = std::min(k_left, stringlen - stringpos);
             unsigned int out_width  = std::min(k_left, rounded_stringlen - stringpos);
 
             const TIn * const *row_base = ptr[string] + ybase;
 
             // If not all rows are valid, copy the ones that are into local array (see above comment).
             // Entries of row_ptrs at and beyond active_height are not written here.
             if (active_height < height) {
                 for (unsigned int i=0; i<active_height; i++) {
                     row_ptrs[i] = ptr[string][ybase + i];
                 }
 
                 row_base = row_ptrs;
             }
 
             // 'integrate_sums' is a function parameter rather than a template parameter to prevent duplicating too
             // much code.  However, integrated sums make no sense for non-integral types and won't ever be
             // requested.  So put a type trait check here to avoid generating pointless code.
             if (std::is_integral<TOut>::value && integrate_sums && row_sum_multiplier) {
                 interleave_block<height_vectors, block, vlt, true>(out, row_base, in_width, active_height, stringpos, first);
             } else {
                 interleave_block<height_vectors, block, vlt, false>(out, row_base, in_width, active_height, stringpos, first);
             }
 
             // Progress is counted in padded (rounded) units even though only in_width input
             // elements were passed to interleave_block.
             k_left -= out_width;
             string++;
             stringpos=0;
             first=false;
         }
 
         // Runs whenever sums were requested, including the zero-multiplier case that skipped
         // the summing variant above.
         if (std::is_integral<TOut>::value && integrate_sums) {
             FixupRowSums<height_vectors, block, vlt>(out, row_sum_multiplier);
         }
     }
 }

References SVE.

◆ Interleave()

void Interleave	(	TOut *	out,
		const TIn *	in,
		size_t	in_stride,
		const unsigned int	y0,
		const unsigned int	ymax,
		const unsigned int	k0,
		const unsigned int	kmax,
		bool	integrate_sums,
		const int32_t	row_sum_multiplier
	)

Definition at line 263 of file interleave_indirect.cpp.

                                                                                                                                                                                                                    {
     // Interleaves rows [y0, ymax) and columns [k0, kmax) of a strided input into 'out',
     // building a row-pointer array for each group of 'height' rows.
     // 'height' is height_vectors, multiplied by get_vector_length<TOut>() / block when vlt is SVE.
     const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);
 
     // Use alloca here as a std::vector can be expensive in highly threaded scenarios.
     const TIn **row_ptrs = reinterpret_cast<const TIn **>(alloca(height * sizeof(const TIn *)));
 
     const unsigned int width=kmax-k0;
 
     for (unsigned int y=y0; y<ymax; y+=height) {
         // Pointers are formed for all 'height' rows, including any at or beyond ymax; only
         // min(height, ymax-y) rows are reported as active to interleave_block below.
         for (unsigned int r=0; r<height; r++) {
             row_ptrs[r] = in + ((y + r) * in_stride);
         }
 
         // k0 is passed as the row offset, and 'first' is always true because each row block
         // is interleaved in a single call.
         if (std::is_integral<TOut>::value && integrate_sums && row_sum_multiplier) {
             interleave_block<height_vectors, block, vlt, true>(out, row_ptrs, width, std::min(height, ymax-y), k0, true);
         } else {
             interleave_block<height_vectors, block, vlt, false>(out, row_ptrs, width, std::min(height, ymax-y), k0, true);
         }
 
         // Runs whenever sums were requested, including the zero-multiplier case that skipped
         // the summing variant above.
         if (std::is_integral<TOut>::value && integrate_sums) {
             FixupRowSums<height_vectors, block, vlt>(out, row_sum_multiplier);
         }
     }
 }

References SVE.

◆ interleave_block()

void arm_gemm::interleave_block	(	TOut *&	out,
		const TIn *const *	in,
		size_t	width,
		size_t	height,
		size_t	row_offset,
		bool	first
	)

Definition at line 59 of file interleave_indirect.cpp.

                                                                                                                        {
     // Generic interleave: for each group of 'block' columns, emits 'block' elements from each of
     // int_by rows in turn, writing zeros for rows >= height and for columns >= width.
     // 'out' is advanced past everything written.  With integrate_sums, int_by int32 per-row sums
     // are appended after the data.
     // int_by is height_vectors, multiplied by get_vector_length<TOut>() / block when vlt is SVE.
     const unsigned int int_by = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block : 1);
 
     std::vector<int32_t> the_sums;
 
     if (integrate_sums) {
         the_sums = std::vector<int32_t>(int_by, 0);
 
         if (!first) {
             // In 'integrate sums' mode, we dump the sums at the end on each pass.
 
             // On the last pass this is correct, but on other passes it is not -
             // so on the subsequent pass we need to take the output written by
             // the previous pass as starting point for the sums, and then
             // overwrite them with new interleaved data.
             int32_t *out_int32 = reinterpret_cast<int32_t *>(out);
 
             // Rewind pointer to where we wrote out the sums last time.
             out_int32 -= int_by;
 
             // Restore the running sums.
             memcpy(the_sums.data(), out_int32, int_by * sizeof(int32_t));
 
             // Update the "real" pointer so that the next output will clobber the old sums.
             out = reinterpret_cast<TOut *>(out_int32);
         }
     }
 
     for (unsigned int pos=0; pos<width; pos+=block) {
         for (unsigned int row=0; row<int_by; row++) {
             // Row out of range - pad 'block' entries.
             // in[row] is not read for these rows.
             if (row >= height) {
                 for (unsigned int col=0; col<block; col++) {
                     *out++ = 0;
                 }
                 continue;
             }
 
             for (unsigned int col=0; col<block; col++) {
                 // Column out of range - pad a single entry
                 if (pos + col >= width) {
                     *out++ = 0;
                     continue;
                 }
 
                 // Padding never contributes to the sums; only real input elements are added.
                 if (integrate_sums) {
                     the_sums[row] += in[row][row_offset + pos + col];
                 }
 
                 *out++ = in[row][row_offset + pos + col];
             }
         }
     }
 
     if (integrate_sums) {
         int32_t *out_int32 = reinterpret_cast<int32_t *>(out);
 
         // Append the running sums after the data and leave 'out' pointing just past them.
         memcpy(out_int32, the_sums.data(), int_by * sizeof(int32_t));
 
         out = reinterpret_cast<TOut *>(out_int32 + int_by);
     }
 }

References SVE.

◆ MergeResults()

void MergeResults	(	Tout *	out,
		const Tin *	in,
		int	ldc,
		int	y0,
		int	ymax,
		int	x0,
		int	xmax,
		const Tout *	bias,
		Activation	act,
		bool	append
	)

Definition at line 39 of file mergeresults.cpp.

                                                                                                                                           {
     // Writes results from 'in', read as consecutive tiles of width * height elements, into the
     // region [y0, ymax) x [x0, xmax) of 'out' (row pitch 'ldc').  Each value optionally has the
     // existing output added ('append') and the bias added (when 'bias' is non-null), and then
     // has the activation applied.
     // For SVE cases, multiply the width up by the vector length.
     // Use the *input* type to determine this, since this will be what the kernel operated on.
     const int width = twidth * (sve ? get_vector_length<Tin>() : 1);
 
     const int full_y_blocks = (ymax - y0) / height;
     const int y_remainder = (ymax - y0) % height;
     const int y_blocks = full_y_blocks + (y_remainder ? 1 : 0);
 
     const int full_x_blocks = (xmax - x0) / width;
     const int x_remainder = (xmax - x0) % width;
     const int x_blocks = full_x_blocks + (x_remainder ? 1 : 0);
 
     for (int y_block = 0; y_block < y_blocks; y_block++) {
         int ybase = y0 + (y_block * height);
 
         // The final, partial block only covers the remainder rows.
         int fill_rows = (y_block < full_y_blocks) ? height : y_remainder;
 
         for (int x_block = 0; x_block < x_blocks; x_block++) {
             int xbase = x0 + (x_block * width);
 
             // The final, partial block only covers the remainder columns.
             int fill_cols = (x_block < full_x_blocks) ? width : x_remainder;
 
             for (int row=0; row < fill_rows; row++) {
                 for (int col=0; col < fill_cols; col++) {
                     Tout &r = out[(ybase + row) * ldc + xbase + col];
                     Tout v = in[row * width + col];
 
                     if (append) {
                         v += r;
                     }
 
                     // Bias is indexed by absolute output column.
                     if (bias) {
                         v += bias[xbase + col];
                     }
 
                     // Activation types not listed below are treated like None.
                     switch(act.type) {
                         default:
                         case Activation::Type::None:
                             break;
 
                         case Activation::Type::ReLU:
                             v = std::max(v, static_cast<Tout>(0));
                             break;
 
                         case Activation::Type::BoundedReLU:
                             v = std::max(std::min(v, static_cast<Tout>(act.param1)), static_cast<Tout>(0));
                             break;
                     }
 
                     r = v;
                 }
             }
 
             // Every tile, including a partial one, consumes a full width * height of input.
             in += (width * height);
         }
     }
 }

References tf_frozen_model_extractor::None.

Referenced by StdTransformsSVE< TOperand, TResult, height, width_vectors, block, mmla, integrate_sums >::Merge().

◆ quant_hybrid_asymmetric()

bool arm_gemm::quant_hybrid_asymmetric ( const Requantize32 & qp )

inline

Definition at line 107 of file utils.hpp.

                                                             {
     return quant_no_left_shift(qp) /*  && qp.b_offset != 0 */ && qp.per_channel_requant==false;
 }

References quant_no_left_shift().

◆ quant_hybrid_symmetric()

bool arm_gemm::quant_hybrid_symmetric ( const Requantize32 & qp )

inline

Definition at line 101 of file utils.hpp.

                                                            {
     // True when quant_no_left_shift(qp) holds and the B offset is zero.
     // Unlike quant_hybrid_asymmetric(), per_channel_requant is not tested here.
     return quant_no_left_shift(qp) && qp.b_offset == 0;
 }

References quant_no_left_shift().

◆ quant_no_left_shift()

bool arm_gemm::quant_no_left_shift ( const Requantize32 & qp )

inline

Definition at line 91 of file utils.hpp.

                                                         {
     // Reports whether the requantization parameters ask for no left shift.
     if (qp.per_channel_requant) {
         // Per-channel mode: a null left-shift array counts as "no left shift".
         return (qp.per_channel_left_shifts == nullptr);
     } else {
         // Per-layer mode: the single left-shift amount must be zero.
         return (qp.per_layer_left_shift == 0);
     }
 }

Referenced by quant_hybrid_asymmetric(), and quant_hybrid_symmetric().

◆ requantize_block_32()

void arm_gemm::requantize_block_32	(	const Requantize32 &	qp,
		unsigned int	width,
		unsigned int	height,
		const Tin *	input,
		unsigned int	in_stride,
		Tout *	output,
		unsigned int	out_stride,
		const int32_t *	row_bias,
		const int32_t *	col_bias,
		unsigned int	start_col
	)

Referenced by GemmHybridQuantized< strategy, To, Tr >::execute().

◆ roundup()

T arm_gemm::roundup	(	const T	a,
		const T	b
	)

inline

Definition at line 45 of file utils.hpp.

                                        {
     // Returns a rounded up to a multiple of b; a is returned unchanged when
     // it is already a multiple (remainder of zero).
     // NOTE(review): presumably callers pass non-negative a and b > 0 — with
     // integer T, a % b is undefined for b == 0, and a negative remainder
     // would not yield a multiple of b. Confirm against callers.
     T rem = a % b;
 
     if (rem) {
         // Add the shortfall (b - rem) to reach the next multiple of b.
         return a + b - rem;
     } else {
         return a;
     }
 }

References arm_compute::test::validation::b.

Referenced by NEWinogradConvolutionLayer::configure(), GemmHybrid< strategy, To, Tr >::estimate_cycles(), GemmInterleavedPretransposed2d< strategy, To, Tr >::estimate_cycles(), GemmInterleaved< strategy, To, Tr, OutputStage, MergeStep, ForceThreadColumns >::estimate_cycles(), GemvPretransposed< strategy, To, Tr >::execute(), GemmHybrid< strategy, To, Tr >::execute(), GemmHybridQuantized< strategy, To, Tr >::execute(), GemmHybridQuantizedInline< strategy, To, Tr >::execute(), GemmHybridIndirect< strategy, To, Tr, OutputStage, SeparateQuantize >::execute(), GemmInterleaved< strategy, To, Tr, OutputStage, MergeStep, ForceThreadColumns >::execute(), GemmHybrid< strategy, To, Tr >::get_B_pretransposed_array_size(), GemmHybridQuantizedInline< strategy, To, Tr >::get_B_pretransposed_array_size(), GemmHybridQuantized< strategy, To, Tr >::get_B_pretransposed_array_size(), GemmHybridIndirect< strategy, To, Tr, OutputStage, SeparateQuantize >::get_B_pretransposed_array_size(), GemmInterleaved< strategy, To, Tr, OutputStage, MergeStep, ForceThreadColumns >::get_B_pretransposed_array_size(), GemmHybrid< strategy, To, Tr >::pretranspose_B_array(), GemmHybridQuantizedInline< strategy, To, Tr >::pretranspose_B_array(), GemmHybridQuantized< strategy, To, Tr >::pretranspose_B_array(), GemmHybridIndirect< strategy, To, Tr, OutputStage, SeparateQuantize >::pretranspose_B_array(), GemmInterleaved< strategy, To, Tr, OutputStage, MergeStep, ForceThreadColumns >::pretranspose_B_array(), and NEWinogradLayerTransformWeightsKernel< T, OutputTileRows, OutputTileCols, KernelRows, KernelCols >::run().

◆ row_sums_indirect()

void arm_gemm::row_sums_indirect	(	unsigned int	num_strings,
		const unsigned int *	string_lengths,
		IndirectInputArg< T >	A_arg,
		size_t	M,
		int32_t *	output_ptr,
		const Requantize32 *	qp
	)

◆ Transform()

void arm_gemm::Transform	(	TOut *	out,
		const TIn *const	in,
		const int	stride,
		const int	k0,
		const int	kmax,
		const int	x0,
		const int	xmax
	)

Definition at line 109 of file transform.hpp.

   {
   // Redirect to a specialised implementation predicated on argument size.
   // sizeof(TOut) and sizeof(TIn) are passed as template arguments alongside
   // IntBy, BlockBy, Transposed and vlt; every runtime argument is forwarded
   // unchanged to the selected TransformImpl specialisation.
   TransformImpl<IntBy, BlockBy, Transposed, sizeof(TOut), sizeof(TIn), vlt>::Transform(
     out, in, stride, k0, kmax, x0, xmax
   );
 }

Variable Documentation

◆ report_mutex

std::mutex report_mutex

Definition at line 32 of file misc.cpp.

Namespaces

Data Structures

Typedefs

Enumerations

Functions

Variables

Typedef Documentation

◆ bfloat16

◆ GemmInterleavedNoMerge

◆ GemmInterleavedPretransposedNoMergeQuantizedInline

◆ GemmInterleavedQuantized

Enumeration Type Documentation

◆ VLType

Function Documentation

◆ activator()

◆ bias_adder()

◆ compute_col_sums()

◆ compute_row_sums()

◆ ConvolutionInterleave()

◆ find_implementation()

◆ FixupRowSums()

◆ gemm()

◆ gemm< bfloat16, float, Nothing >()

◆ gemm< float, float, Nothing >()

◆ gemm_implementation_list()

◆ gemm_implementation_list< bfloat16, float >()

◆ gemm_implementation_list< float, float >()

◆ get_compatible_kernels()

◆ get_compatible_kernels< bfloat16, float, Nothing >()

◆ get_compatible_kernels< float, float, Nothing >()

◆ get_gemm_method()

◆ get_gemm_method< bfloat16, float, Nothing >()

◆ get_gemm_method< float, float, Nothing >()

◆ iceildiv()

◆ IndirectInterleave()

◆ Interleave()

◆ interleave_block()

◆ MergeResults()

◆ quant_hybrid_asymmetric()

◆ quant_hybrid_symmetric()

◆ quant_no_left_shift()

◆ requantize_block_32()

◆ roundup()

◆ row_sums_indirect()

◆ Transform()

Variable Documentation

◆ report_mutex