38 Status validate_arguments_matrix_a_reduction(
const ITensorInfo *
input,
const ITensorInfo *output)
43 if(output->total_size() > 0)
46 ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) != input->dimension(1),
"Output vector must have length equal to the number of rows of the input matrix");
50 Status validate_arguments_matrix_b_reduction(
const ITensorInfo *input,
const ITensorInfo *output)
55 if(output->total_size() > 0)
58 ARM_COMPUTE_RETURN_ERROR_ON_MSG(output->dimension(0) != input->dimension(0),
"Output vector must have length equal to the number of columns of the input matrix");
65 : _input(), _output(), _k(0), _scalar(0), _mul_by_scalar(false)
76 _output = vector_sum_row;
87 INEKernel::configure(win);
106 Window win_input(collapsed_window);
112 Iterator out(_output, collapsed_window);
122 asm volatile(
"PLD [%0, #128*4]" ::
"r"(matrix_a));
127 for(; i <= (_k - 16); i += 16)
141 sum_row +=
static_cast<TAcc
>(matrix_a[i]);
144 #if defined(__aarch64__) 146 sum_row += wrapper::vaddv(vsum_row);
152 #endif // __aarch64__ 160 *(
reinterpret_cast<int *
>(out.
ptr())) =
static_cast<int32_t
>(sum_row);
174 run_internal<uint8_t>(
window);
179 run_internal<int8_t>(
window);
194 _output = vector_sum_col;
208 INEKernel::configure(win);
219 template <
typename T>
220 void NEGEMMLowpMatrixBReductionKernel::run_internal(
const Window &window,
const ThreadInfo &
info)
229 const auto width_matrix_b =
static_cast<int>(_input->
info()->
dimension(0));
233 const int window_start_x = 16 * info.
thread_id;
236 const int window_end_x =
ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x;
238 Window win_out(collapsed_window);
250 if(
id.x() > width_matrix_b)
264 const auto *matrix_b =
reinterpret_cast<const T *
>(inb.
ptr() +
id.y() * _input->
info()->
strides_in_bytes()[2]);
267 asm volatile(
"PLD [%0, #128*4]" ::
"r"(matrix_b));
268 asm volatile(
"PLD [%0, #128*4]" ::
"r"(matrix_b + in_b_stride));
273 for(; i <= (_k - 4); i += 4)
281 asm volatile(
"PLD [%0, #128*1]" ::
"r"(matrix_b + 1 * in_b_stride));
282 asm volatile(
"PLD [%0, #128*1]" ::
"r"(matrix_b + 2 * in_b_stride));
283 asm volatile(
"PLD [%0, #128*1]" ::
"r"(matrix_b + 3 * in_b_stride));
284 asm volatile(
"PLD [%0, #128*1]" ::
"r"(matrix_b + 4 * in_b_stride));
309 matrix_b += 4 * in_b_stride;
330 matrix_b += in_b_stride;
342 auto vector_sum_col =
reinterpret_cast<int32_t *
>(out.
ptr());
343 if(
id.x() + 16 < width_matrix_b)
352 auto left_over = width_matrix_b -
id.x();
353 for(
auto k = 0; k < 4 && left_over; ++k)
355 for(
auto j = 0; j < 4 && left_over; ++j, --left_over)
357 *(vector_sum_col + k * 4 + j) = sum_col[k][j];
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
const Window & window() const
The maximum window the kernel can be executed on.
virtual size_t dimension(size_t index) const =0
Return the size of the requested dimension.
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
INEGEMMLowpReductionKernel()
Constructor.
uint8x16_t vloadq(const uint8_t *ptr)
#define ARM_COMPUTE_RETURN_ON_ERROR(status)
Checks if a status contains an error and returns it.
virtual DataType data_type() const =0
Data type used for each element of the tensor.
uint8x8_t vadd(const uint8x8_t &a, const uint8x8_t &b)
void configure(const ITensor *mtx_b, ITensor *vector_sum_col, const GEMMLowpReductionKernelInfo &info) override
Initialise the kernel's input and output.
Store the tensor's metadata.
#define ARM_COMPUTE_ERROR_THROW_ON(status)
Describe one of the image's dimensions with a start, end and step.
int32_t scalar
Scalar value to multiply each reduced column/row by.
static Status validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info)
Static function to check if given info will lead to a valid configuration of NEGEMMLowpMatrixBReductionKernel.
Interface for Neon tensor.
Window calculate_max_window_horizontal(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
Copyright (c) 2017-2021 Arm Limited.
virtual void set_valid_region(const ValidRegion &valid_region)=0
Set the valid region of the tensor.
#define ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(...)
1 channel, 1 S32 per channel
typename promote< T >::type promote_t
Get promoted type.
uint8x8_t vpadd(const uint8x8_t &a, const uint8x8_t &b)
uint8_t vgetlane(const uint8x8_t vector, const unsigned int lane)
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
void configure(const ITensor *mtx_a, ITensor *vector_sum_row, const GEMMLowpReductionKernelInfo &info) override
Initialise the kernel's input and output.
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
int32_t k
Number of matrix columns/rows.
Window collapse_if_possible(const Window &full_window, size_t first, size_t last, bool *has_collapsed=nullptr) const
Collapse the dimensions between first and last if possible.
virtual const TensorShape & tensor_shape() const =0
Size for each dimension of the tensor.
auto ceil_to_multiple(S value, T divisor) -> decltype(((value+divisor - 1)/divisor) *divisor)
Computes the smallest number larger or equal to value that is a multiple of divisor.
quantized, asymmetric fixed-point 8-bit number unsigned
Class to describe a number of elements in each dimension.
int16x4_t vreinterpret(const uint16x4_t &a)
#define ARM_COMPUTE_ERROR_ON_MSG(cond, msg)
bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, QuantizationInfo quantization_info=QuantizationInfo())
Auto initialize the tensor info (shape, number of channels and data type) if the current assignment is empty.
Create the appropriate Neon vector given its type and size in terms of bits.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
constexpr uint8_t * ptr() const
Return a pointer to the current pixel.
uint8x8_t vgetlow(const uint8x16_t val)
void run(const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
void set(size_t dimension, const Dimension &dim)
Set the values of a given dimension.
uint16x8_t vaddl(const uint8x8_t &a, const uint8x8_t &b)
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
quantized, symmetric fixed-point 8-bit number
quantized, symmetric per channel fixed-point 8-bit number
uint8x8_t vgethigh(const uint8x16_t val)
uint16x8_t vaddw(const uint16x8_t &a, const uint8x8_t &b)
static constexpr size_t DimY
Alias for dimension 1 also known as Y dimension.
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
uint8x8_t vmul(const uint8x8_t &a, const uint8x8_t &b)
Information about executing thread and CPU.
static constexpr size_t DimZ
Alias for dimension 2 also known as Z dimension.
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
void run(const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
void vstore(uint8_t *ptr, uint8x8_t val)
unsigned int num_elems_processed_per_iteration
#define ARM_COMPUTE_RETURN_ERROR_ON_MSG(cond, msg)
If the condition is true, an error is returned.
bool is_reshaped
True if the input tensor has been reshaped.
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
uint8x8_t vdup_n(uint8_t value, traits::vector_64_tag)
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_function for each element.
uint16x4_t vpaddl(const uint8x8_t &a)
T y() const
Alias to access the size of the second dimension.
quantized, asymmetric fixed-point 8-bit number signed
Includes all wrapper headers at once.
virtual const Strides & strides_in_bytes() const =0
The strides in bytes for accessing each dimension of the tensor.
Container for valid region of a window.
static Status validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info)
Static function to check if given info will lead to a valid configuration of NEGEMMLowpMatrixAReductionKernel.
Iterator updated by execute_window_loop for each window element.
uint16x8_t vmovl(const uint8x8_t &a)
bool mul_by_scalar
True if each column/row reduction has to be multiplied by a scalar value.
Describe a multidimensional execution window.
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)