24.02.1
|
Go to the documentation of this file.
42 Status validate_arguments_matrix_a_reduction(
const ITensorInfo *
src,
43 const ITensorInfo *
dst,
44 const GEMMLowpReductionKernelInfo &
info)
52 if (
dst->total_size() > 0)
56 dst->dimension(0) !=
src->dimension(1),
57 "Output vector must have length equal to the number of rows of the input matrix");
61 Status validate_arguments_matrix_b_reduction(
const ITensorInfo *
src,
62 const ITensorInfo *
dst,
63 const GEMMLowpReductionKernelInfo &
info)
71 if (
dst->total_size() > 0)
75 dst->dimension(0) !=
src->dimension(0),
76 "Output vector must have length equal to the number of columns of the input matrix");
90 _scalar =
info.scalar;
91 _mul_by_scalar =
info.mul_by_scalar;
93 switch (
src->data_type())
96 _func = &CpuGemmLowpMatrixAReductionKernel::run_internal<uint8_t>;
101 _func = &CpuGemmLowpMatrixAReductionKernel::run_internal<int8_t>;
111 ICpuKernel::configure(win);
122 template <
typename T>
123 void CpuGemmLowpMatrixAReductionKernel::run_internal(
const ITensor *
src,
133 Window win_input(collapsed_window);
148 const T *matrix_a =
reinterpret_cast<const T *
>(
149 (in.ptr() +
id.x() *
src->info()->strides_in_bytes()[1] +
id.y() *
src->info()->strides_in_bytes()[2]));
152 asm volatile(
"PLD [%0, #128*4]" ::
"r"(matrix_a));
157 for (; i <= (_k - 16); i += 16)
171 sum_row +=
static_cast<TAcc
>(matrix_a[i]);
174 #if defined(__aarch64__)
176 sum_row += wrapper::vaddv(vsum_row);
182 #endif // __aarch64__
190 *(
reinterpret_cast<int *
>(out.ptr())) =
static_cast<int32_t
>(sum_row);
209 return "CpuGemmLowpMatrixAReductionKernel";
220 _scalar =
info.scalar;
221 _mul_by_scalar =
info.mul_by_scalar;
226 switch (
src->data_type())
229 _func = &CpuGemmLowpMatrixBReductionKernel::run_internal<uint8_t>;
234 _func = &CpuGemmLowpMatrixBReductionKernel::run_internal<int8_t>;
245 ICpuKernel::configure(win);
256 template <
typename T>
257 void CpuGemmLowpMatrixBReductionKernel::run_internal(
const ITensor *
src,
269 const auto width_matrix_b =
static_cast<int>(
src->info()->dimension(0));
270 const auto in_b_stride =
static_cast<int>(
src->info()->strides_in_bytes()[1]);
273 const int window_start_x = 16 *
info.thread_id;
274 const int window_step_x = 16 *
info.num_threads;
276 const int window_end_x =
ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
278 Window win_out(collapsed_window);
292 if (
id.x() > width_matrix_b)
301 wrapper::vdup_n(
static_cast<TAcc
>(0), wrapper::traits::vector_128_tag{}),
302 wrapper::vdup_n(
static_cast<TAcc
>(0), wrapper::traits::vector_128_tag{}),
303 wrapper::vdup_n(
static_cast<TAcc
>(0), wrapper::traits::vector_128_tag{})};
305 const auto *matrix_b =
reinterpret_cast<const T *
>(inb.ptr() +
id.y() *
src->info()->strides_in_bytes()[2]);
308 asm volatile(
"PLD [%0, #128*4]" ::
"r"(matrix_b));
309 asm volatile(
"PLD [%0, #128*4]" ::
"r"(matrix_b + in_b_stride));
313 if ((width_matrix_b -
id.x()) >= 16)
318 for (; i <= (_k - 4); i += 4)
331 asm volatile(
"PLD [%0, #128*1]" ::
"r"(matrix_b + 1 * in_b_stride));
332 asm volatile(
"PLD [%0, #128*1]" ::
"r"(matrix_b + 2 * in_b_stride));
333 asm volatile(
"PLD [%0, #128*1]" ::
"r"(matrix_b + 3 * in_b_stride));
334 asm volatile(
"PLD [%0, #128*1]" ::
"r"(matrix_b + 4 * in_b_stride));
341 {
wrapper::vdup_n(
static_cast<TIAcc
>(0), wrapper::traits::vector_128_tag{}),
342 wrapper::vdup_n(
static_cast<TIAcc
>(0), wrapper::traits::vector_128_tag{})};
360 matrix_b += 4 * in_b_stride;
378 matrix_b += in_b_stride;
384 for (
int i = 0; i < _k; ++i)
386 auto left_over_cols = width_matrix_b -
id.x();
387 auto l = left_over_cols;
388 for (
auto k = 0; k < 4 && l; ++k)
390 for (
auto j = 0; j < 4 && l; ++j, --l)
392 sum_col[k][j] += matrix_b[left_over_cols - l];
395 matrix_b += in_b_stride;
408 auto vector_sum_col =
reinterpret_cast<int32_t *
>(out.ptr());
409 if ((width_matrix_b -
id.x()) >= 16)
418 auto left_over = width_matrix_b -
id.x();
419 for (
auto k = 0; k < 4 && left_over; ++k)
421 for (
auto j = 0; j < 4 && left_over; ++j, --left_over)
423 *(vector_sum_col + k * 4 + j) = sum_col[k][j];
445 return "CpuGemmLowpMatrixBReductionKernel";
Class to describe a number of elements in each dimension.
@ QSYMM8_PER_CHANNEL
quantized, symmetric per channel fixed-point 8-bit number
uint8x8_t vadd(const uint8x8_t &a, const uint8x8_t &b)
SimpleTensor< float > src
decltype(strategy::transforms) typedef type
uint16x8_t vmovl(const uint8x8_t &a)
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
@ QASYMM8
quantized, asymmetric fixed-point 8-bit number unsigned
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
@ QSYMM8
quantized, symmetric fixed-point 8-bit number
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
uint8_t vgetlane(const uint8x8_t vector, const unsigned int lane)
Window collapse_if_possible(const Window &full_window, size_t first, size_t last, bool *has_collapsed=nullptr) const
Collapse the dimensions between first and last if possible.
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
Interface for CPU tensor.
ITensor * get_tensor(int id)
Get tensor of a given id from the pack.
uint8x16_t vloadq(const uint8_t *ptr)
Includes all wrapper headers at once.
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
void configure(const ITensorInfo *src, ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info)
Initialise the kernel's input and output.
uint8x8_t vpadd(const uint8x8_t &a, const uint8x8_t &b)
#define ARM_COMPUTE_RETURN_ON_ERROR(status)
Checks if a status contains an error and returns it.
Create the appropriate SIMD vector given its type and size in terms of bits.
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
const ITensor * get_const_tensor(int id) const
Get constant tensor of a given id.
#define ARM_COMPUTE_ERROR_THROW_ON(status)
uint8x8_t vmul(const uint8x8_t &a, const uint8x8_t &b)
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&...iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_function for each element.
int16x4_t vreinterpret(const uint16x4_t &a)
Window calculate_max_window_horizontal(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
Iterator updated by execute_window_loop for each window element.
#define ARM_COMPUTE_ERROR_ON_MSG(cond, msg)
bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, QuantizationInfo quantization_info=QuantizationInfo())
Auto initialize the tensor info (shape, number of channels and data type) if the current assignment is empty.
@ QASYMM8_SIGNED
quantized, asymmetric fixed-point 8-bit number signed
void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)
uint16x8_t vaddl(const uint8x8_t &a, const uint8x8_t &b)
static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info)
Static function to check if given info will lead to a valid configuration.
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
Describe one of the image's dimensions with a start, end and step.
static constexpr size_t DimY
Alias for dimension 1 also known as Y dimension.
const char * name() const override
Name of the kernel.
auto ceil_to_multiple(S value, T divisor) -> decltype(((value+divisor - 1)/divisor) *divisor)
Computes the smallest number larger or equal to value that is a multiple of divisor.
const Window & window() const
The maximum window the kernel can be executed on.
Information about executing thread and CPU.
uint8x8_t vgetlow(const uint8x16_t val)
void vstore(uint8_t *ptr, uint8x8_t val)
Describe a multidimensional execution window.
void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
uint16x8_t vaddw(const uint16x8_t &a, const uint8x8_t &b)
#define ARM_COMPUTE_RETURN_ERROR_ON_MSG(cond, msg)
If the condition is true, an error is returned.
Copyright (c) 2017-2024 Arm Limited.
static constexpr size_t DimZ
Alias for dimension 2 also known as Z dimension.
@ S32
signed 32-bit number
#define ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(...)
Store the tensor's metadata.
const char * name() const override
Name of the kernel.
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
uint16x4_t vpaddl(const uint8x8_t &a)
typename promote< T >::type promote_t
Get promoted type.
void configure(const ITensorInfo *src, ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info)
Initialise the kernel's input and output.
unsigned int num_elems_processed_per_iteration
uint8x8_t vgethigh(const uint8x16_t val)
uint8x8_t vdup_n(uint8_t value, traits::vector_64_tag)
static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLowpReductionKernelInfo &info)
Static function to check if given info will lead to a valid configuration.