// Fragment of the shape-validation helper: the transposed output swaps the
// input's first two dimensions — output width = input height, output height =
// input width. (Presumably `in` is the input TensorShape — confirm upstream.)
51 const size_t w_out = in[1];
52 const size_t h_out = in[0];
// Fragment: helper that returns how many rows each kernel iteration
// processes, chosen by the element size in bytes (body elided in this view).
76 unsigned int num_elems_processed(
size_t element_size)
// --- transpose_8bit_elements (fragment) -----------------------------------
// NOTE(review): doxygen source-view excerpt; intervening original lines
// (braces, Iterator setup, execute_window_loop lambdas) are elided here.
// Processes the input in 8x8 byte tiles: 8 elements along x, 8 rows along y.
94 const int window_step_x = 8;
95 const int window_step_y = 8;
96 const int window_start_x = window.
x().
start();
97 const int window_end_x = window.
x().
end();
98 const int window_start_y = window.
y().
start();
// Clamp the y extent to the input's height (dimension 1) so the tile loads
// below never read past the last input row.
99 const int window_end_y = std::min(window.
y().
end(),
static_cast<int>(in->
info()->
dimension(1)));
// Largest step-aligned y extent: the vectorized loop covers
// [window_start_y, window_end_y_multiple_of); any remainder rows are handled
// by a scalar left-over-y pass (see the tail fragment below).
100 const int window_end_y_multiple_of = ((window_end_y - window_start_y) / window_step_y) * window_step_y;
105 bool left_over_loop_y = (((window_end_y - window_start_y) % window_step_y) != 0);
// Run the vectorized path only when there is at least one full 8-row band.
112 if(window_end_y_multiple_of > window_start_y)
122 Window window_out(window);
135 int x = window_start_x;
// Vectorized x loop: one full 8x8 tile per iteration.
136 for(; x <= (window_end_x - window_step_x); x += window_step_x)
// Load 8 consecutive input rows of 8 bytes each (one 8x8 tile).
138 const uint8x8_t row0 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 0 * input_stride_in_bytes));
139 const uint8x8_t row1 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 1 * input_stride_in_bytes));
140 const uint8x8_t row2 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 2 * input_stride_in_bytes));
141 const uint8x8_t row3 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 3 * input_stride_in_bytes));
142 const uint8x8_t row4 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 4 * input_stride_in_bytes));
143 const uint8x8_t row5 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 5 * input_stride_in_bytes));
144 const uint8x8_t row6 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 6 * input_stride_in_bytes));
145 const uint8x8_t row7 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 7 * input_stride_in_bytes));
// Three-level in-register transpose network: interleave at 8-bit, then
// 16-bit, then 32-bit granularity to fully transpose the 8x8 tile.
148 const uint8x8x2_t k0_u8 = vtrn_u8(row0, row1);
149 const uint8x8x2_t k1_u8 = vtrn_u8(row2, row3);
150 const uint8x8x2_t k2_u8 = vtrn_u8(row4, row5);
151 const uint8x8x2_t k3_u8 = vtrn_u8(row6, row7);
154 const uint16x4x2_t k0_u16 = vtrn_u16(vreinterpret_u16_u8(k0_u8.val[0]), vreinterpret_u16_u8(k1_u8.val[0]));
155 const uint16x4x2_t k1_u16 = vtrn_u16(vreinterpret_u16_u8(k0_u8.val[1]), vreinterpret_u16_u8(k1_u8.val[1]));
156 const uint16x4x2_t k2_u16 = vtrn_u16(vreinterpret_u16_u8(k2_u8.val[0]), vreinterpret_u16_u8(k3_u8.val[0]));
157 const uint16x4x2_t k3_u16 = vtrn_u16(vreinterpret_u16_u8(k2_u8.val[1]), vreinterpret_u16_u8(k3_u8.val[1]));
160 const uint32x2x2_t k0_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[0]), vreinterpret_u32_u16(k2_u16.val[0]));
161 const uint32x2x2_t k1_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[1]), vreinterpret_u32_u16(k2_u16.val[1]));
162 const uint32x2x2_t k2_u32 = vtrn_u32(vreinterpret_u32_u16(k1_u16.val[0]), vreinterpret_u32_u16(k3_u16.val[0]));
163 const uint32x2x2_t k3_u32 = vtrn_u32(vreinterpret_u32_u16(k1_u16.val[1]), vreinterpret_u32_u16(k3_u16.val[1]));
// Destination offset: the source row index (id.y(), supplied by the elided
// execute_window_loop) becomes the output column, and the source column x
// selects the output row via output_stride_in_bytes.
166 const size_t dst_offset_in_bytes =
id.y() *
sizeof(uint8_t) + x * output_stride_in_bytes;
// Store the eight transposed rows; the k0/k2/k1/k3 low-half-then-high-half
// ordering undoes the bit-reversed interleaving left by the vtrn stages.
168 vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k0_u32.val[0])));
169 vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k2_u32.val[0])));
170 vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k1_u32.val[0])));
171 vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k3_u32.val[0])));
172 vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 4 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k0_u32.val[1])));
173 vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 5 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k2_u32.val[1])));
174 vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 6 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k1_u32.val[1])));
175 vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 7 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k3_u32.val[1])));
// Left-over x loop: fewer than 8 columns remain. Gather one input column
// (8 rows) into a lane vector and store it as a single output row.
179 for(; x < window_end_x; ++x)
181 const uint8_t val0 = *(input.ptr() + x + 0 * input_stride_in_bytes);
182 const uint8_t val1 = *(input.ptr() + x + 1 * input_stride_in_bytes);
183 const uint8_t val2 = *(input.ptr() + x + 2 * input_stride_in_bytes);
184 const uint8_t val3 = *(input.ptr() + x + 3 * input_stride_in_bytes);
185 const uint8_t val4 = *(input.ptr() + x + 4 * input_stride_in_bytes);
186 const uint8_t val5 = *(input.ptr() + x + 5 * input_stride_in_bytes);
187 const uint8_t val6 = *(input.ptr() + x + 6 * input_stride_in_bytes);
188 const uint8_t val7 = *(input.ptr() + x + 7 * input_stride_in_bytes);
190 uint8x8_t result = vdup_n_u8(0);
191 result = vset_lane_u8(val0, result, 0);
192 result = vset_lane_u8(val1, result, 1);
193 result = vset_lane_u8(val2, result, 2);
194 result = vset_lane_u8(val3, result, 3);
195 result = vset_lane_u8(val4, result, 4);
196 result = vset_lane_u8(val5, result, 5);
197 result = vset_lane_u8(val6, result, 6);
198 result = vset_lane_u8(val7, result, 7);
201 const size_t dst_offset_in_bytes =
id.y() *
sizeof(uint8_t) + x * output_stride_in_bytes;
203 vst1_u8(output.ptr() + dst_offset_in_bytes, result);
// Left-over y tail (loop elided): plain scalar element-wise transpose,
// output[x][y] = input[y][x], for the remaining non-multiple-of-8 rows.
220 const uint8_t val0 = *input.ptr();
223 const size_t dst_offset_in_bytes =
id.y() *
sizeof(uint8_t) +
id.x() * output_stride_in_bytes;
225 *(output.ptr() + dst_offset_in_bytes) = val0;
// --- transpose_16bit_elements (fragment) ----------------------------------
// NOTE(review): doxygen source-view excerpt; braces, Iterator setup and
// execute_window_loop lambdas are elided. Same structure as the 8-bit path
// but with 4x4 tiles of uint16_t.
233 const int window_step_x = 4;
234 const int window_step_y = 4;
235 const int window_start_x = window.
x().
start();
236 const int window_end_x = window.
x().
end();
237 const int window_start_y = window.
y().
start();
// Clamp y to the input height so the 4-row tile loads stay in bounds.
238 const int window_end_y = std::min(window.
y().
end(),
static_cast<int>(in->
info()->
dimension(1)));
// Step-aligned y extent for the vectorized pass; remainder rows go through
// the scalar left-over-y tail below.
239 const int window_end_y_multiple_of = ((window_end_y - window_start_y) / window_step_y) * window_step_y;
244 bool left_over_loop_y = (((window_end_y - window_start_y) % window_step_y) != 0);
251 if(window_end_y_multiple_of > window_start_y)
261 Window window_out(window);
274 int x = window_start_x;
// Vectorized x loop: one 4x4 uint16_t tile per iteration.
275 for(; x <= (window_end_x - window_step_x); x += window_step_x)
277 const uint16x4_t row0 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
278 const uint16x4_t row1 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
279 const uint16x4_t row2 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
280 const uint16x4_t row3 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
// Two-level transpose network: 16-bit interleave, then 32-bit interleave.
283 const uint16x4x2_t k0_u16 = vtrn_u16(row0, row1);
284 const uint16x4x2_t k1_u16 = vtrn_u16(row2, row3);
287 const uint32x2x2_t k0_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[0]), vreinterpret_u32_u16(k1_u16.val[0]));
288 const uint32x2x2_t k1_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[1]), vreinterpret_u32_u16(k1_u16.val[1]));
// Source row (id.y(), from the elided execute_window_loop) becomes the
// output column; source column x selects the output row.
291 const size_t dst_offset_in_bytes =
id.y() *
sizeof(uint16_t) + x * output_stride_in_bytes;
293 vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vreinterpret_u16_u32(k0_u32.val[0]));
294 vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vreinterpret_u16_u32(k1_u32.val[0]));
295 vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vreinterpret_u16_u32(k0_u32.val[1]));
296 vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vreinterpret_u16_u32(k1_u32.val[1]));
// Left-over x loop: gather one input column (4 rows) lane by lane and store
// it as a single transposed output row.
300 for(; x < window_end_x; ++x)
302 const uint16_t val0 = *(
reinterpret_cast<uint16_t *
>(input.ptr() + 0 * input_stride_in_bytes) + x);
303 const uint16_t val1 = *(
reinterpret_cast<uint16_t *
>(input.ptr() + 1 * input_stride_in_bytes) + x);
304 const uint16_t val2 = *(
reinterpret_cast<uint16_t *
>(input.ptr() + 2 * input_stride_in_bytes) + x);
305 const uint16_t val3 = *(
reinterpret_cast<uint16_t *
>(input.ptr() + 3 * input_stride_in_bytes) + x);
307 uint16x4_t result = vdup_n_u16(0);
308 result = vset_lane_u16(val0, result, 0);
309 result = vset_lane_u16(val1, result, 1);
310 result = vset_lane_u16(val2, result, 2);
311 result = vset_lane_u16(val3, result, 3);
314 const size_t dst_offset_in_bytes =
id.y() *
sizeof(uint16_t) + x * output_stride_in_bytes;
316 vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes), result);
// Left-over y tail (loop elided): scalar output[x][y] = input[y][x] copy
// for the remaining non-multiple-of-4 rows.
333 const uint16_t val0 = *(
reinterpret_cast<uint16_t *
>(input.ptr()));
336 const size_t dst_offset_in_bytes =
id.y() *
sizeof(uint16_t) +
id.x() * output_stride_in_bytes;
338 *(
reinterpret_cast<uint16_t *
>(output.ptr() + dst_offset_in_bytes)) = val0;
// --- transpose_32bit_elements (fragment) ----------------------------------
// NOTE(review): doxygen source-view excerpt; braces, Iterator setup and
// execute_window_loop lambdas are elided. Works on 4x4 tiles of uint32_t
// held in 128-bit registers.
346 const int window_step_x = 4;
347 const int window_step_y = 4;
348 const int window_start_x = window.
x().
start();
349 const int window_end_x = window.
x().
end();
350 const int window_start_y = window.
y().
start();
// Clamp y to the input height so the 4-row tile loads stay in bounds.
351 const int window_end_y = std::min(window.
y().
end(),
static_cast<int>(in->
info()->
dimension(1)));
// Step-aligned y extent for the vectorized pass; remainder rows go through
// the scalar left-over-y tail below.
352 const int window_end_y_multiple_of = ((window_end_y - window_start_y) / window_step_y) * window_step_y;
357 bool left_over_loop_y = (((window_end_y - window_start_y) % window_step_y) != 0);
364 if(window_end_y_multiple_of > window_start_y)
374 Window window_out(window);
387 int x = window_start_x;
// Vectorized x loop: one 4x4 uint32_t tile per iteration.
388 for(; x <= (window_end_x - window_step_x); x += window_step_x)
390 const uint32x4_t row0 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
391 const uint32x4_t row1 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
392 const uint32x4_t row2 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
393 const uint32x4_t row3 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
// 2x2-block transpose of the 4x4 tile: k0/k2 interleave the low/high halves
// of rows 0-1, k3/k1 the low/high halves of rows 2-3. Each kN.val[j] then
// holds a pair of same-column elements.
396 const uint32x2x2_t k0_u32 = vtrn_u32(vget_low_u32(row0), vget_low_u32(row1));
397 const uint32x2x2_t k1_u32 = vtrn_u32(vget_high_u32(row2), vget_high_u32(row3));
398 const uint32x2x2_t k2_u32 = vtrn_u32(vget_high_u32(row0), vget_high_u32(row1));
399 const uint32x2x2_t k3_u32 = vtrn_u32(vget_low_u32(row2), vget_low_u32(row3));
// Source row (id.y(), from the elided execute_window_loop) becomes the
// output column; source column x selects the output row.
402 const size_t dst_offset_in_bytes =
id.y() *
sizeof(uint32_t) + x * output_stride_in_bytes;
// Reassemble full output rows: column c of the tile is (k0|k3) for c=0,1
// and (k2|k1) for c=2,3.
405 vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vcombine_u32(k0_u32.val[0], k3_u32.val[0]));
406 vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vcombine_u32(k0_u32.val[1], k3_u32.val[1]));
407 vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vcombine_u32(k2_u32.val[0], k1_u32.val[0]));
408 vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vcombine_u32(k2_u32.val[1], k1_u32.val[1]));
// Left-over x loop: gather one input column (4 rows) lane by lane and store
// it as a single transposed output row.
412 for(; x < window_end_x; ++x)
414 const uint32_t val0 = *(
reinterpret_cast<uint32_t *
>(input.ptr() + 0 * input_stride_in_bytes) + x);
415 const uint32_t val1 = *(
reinterpret_cast<uint32_t *
>(input.ptr() + 1 * input_stride_in_bytes) + x);
416 const uint32_t val2 = *(
reinterpret_cast<uint32_t *
>(input.ptr() + 2 * input_stride_in_bytes) + x);
417 const uint32_t val3 = *(
reinterpret_cast<uint32_t *
>(input.ptr() + 3 * input_stride_in_bytes) + x);
419 uint32x4_t result = vdupq_n_u32(0);
420 result = vsetq_lane_u32(val0, result, 0);
421 result = vsetq_lane_u32(val1, result, 1);
422 result = vsetq_lane_u32(val2, result, 2);
423 result = vsetq_lane_u32(val3, result, 3);
426 const size_t dst_offset_in_bytes =
id.y() *
sizeof(uint32_t) + x * output_stride_in_bytes;
428 vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes), result);
// Left-over y tail (loop elided): scalar output[x][y] = input[y][x] copy
// for the remaining non-multiple-of-4 rows.
445 const uint32_t val0 = *(
reinterpret_cast<uint32_t *
>(input.ptr()));
448 const size_t dst_offset_in_bytes =
id.y() *
sizeof(uint32_t) +
id.x() * output_stride_in_bytes;
450 *(
reinterpret_cast<uint32_t *
>(output.ptr() + dst_offset_in_bytes)) = val0;
// Default constructor (fragment): member-init list zeroing the worker
// function pointer and the input/output tensor pointers.
: _func(nullptr), _input(nullptr), _output(nullptr)
// configure() fragment: select the per-element-size transpose worker
// (the surrounding switch on element size is elided in this view).
484 _func = &transpose_8bit_elements;
487 _func = &transpose_16bit_elements;
490 _func = &transpose_32bit_elements;
// Window configuration fragment: one element per iteration along x; the y
// step comes from num_elems_processed() based on the input's element size.
505 const unsigned int num_elems_processed_per_iteration_x = 1;
506 const unsigned int num_elems_processed_per_iteration_y = num_elems_processed(input->
info()->
element_size());
511 INEKernel::configure(win);
// run() fragment: dispatch to the worker selected in configure().
521 (*_func)(_input, _output,
window);
virtual size_t num_dimensions() const =0
The number of dimensions of the tensor (rank)
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
const Window & window() const
The maximum window the kernel can be executed on.
virtual size_t dimension(size_t index) const =0
Return the size of the requested dimension.
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(...)
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
#define ARM_COMPUTE_RETURN_ON_ERROR(status)
Checks if a status contains an error and returns it.
virtual DataType data_type() const =0
Data type used for each element of the tensor.
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
Store the tensor's metadata.
#define ARM_COMPUTE_ERROR_THROW_ON(status)
Describe one of the image's dimensions with a start, end and step.
#define ARM_COMPUTE_RETURN_ERROR_ON(cond)
If the condition is true, an error is returned.
Interface for Neon tensor.
Copyright (c) 2017-2021 Arm Limited.
virtual void set_valid_region(const ValidRegion &valid_region)=0
Set the valid region of the tensor.
#define ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(...)
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
virtual const TensorShape & tensor_shape() const =0
Size for each dimension of the tensor.
Class to describe a number of elements in each dimension.
bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, QuantizationInfo quantization_info=QuantizationInfo())
Auto initialize the tensor info (shape, number of channels and data type) if the current assignment is empty.
void run(const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
virtual std::unique_ptr< T > clone() const =0
Provide a clone of the current object of class T.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
virtual size_t element_size() const =0
Element size in bytes calculated as data_size() * num_channels()
static Status validate(const ITensorInfo *input, const ITensorInfo *output)
Static function to check if given info will lead to a valid configuration of NETransposeKernel.
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
NETransposeKernel()
Default constructor.
static constexpr size_t DimY
Alias for dimension 1 also known as Y dimension.
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
Information about executing thread and CPU.
virtual size_t total_size() const =0
Returns the total size of the tensor in bytes.
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(...)
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(...)
constexpr const Dimension & y() const
Alias to access the second dimension of the window.
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage)
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
Store the tensor's metadata.
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_function for each element.
void set_num_dimensions(size_t num_dimensions)
Set number of dimensions.
virtual const Strides & strides_in_bytes() const =0
The strides in bytes for accessing each dimension of the tensor.
Container for valid region of a window.
constexpr int end() const
Return the end of the dimension.
Iterator updated by execute_window_loop for each window element.
constexpr int start() const
Return the start of the dimension.
Describe a multidimensional execution window.
TensorShape & set(size_t dimension, size_t value, bool apply_dim_correction=true, bool increase_dim_unit=true)
Accessor to set the value of one of the dimensions.
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)
void configure(const ITensor *input, ITensor *output)
Initialise the kernel's input and output.
constexpr const Dimension & x() const
Alias to access the first dimension of the window.