46 unsigned int num_elems_processed(
size_t element_size)
// Transposes 1-byte elements from `in` into `out` over `window`.
// The visible code has two paths: an 8x8 tiled path built on NEON vtrn
// intrinsics (with a per-column fallback for the X remainder), and an
// element-by-element path for the rows left over after the last full 8-row tile.
// NOTE(review): the leading integers look like line numbers of the original file,
// and structural lines (braces, loop/callback headers) are absent from this
// listing - confirm details against the real source.
62 void transpose_8bit_elements(
const ITensor *in, ITensor *out,
const Window &window)
// Tile size: 8 columns by 8 rows of uint8_t per vectorised step.
64 const int window_step_x = 8;
65 const int window_step_y = 8;
66 const int window_start_x = window.x().start();
67 const int window_end_x = window.x().end();
68 const int window_start_y = window.y().start();
// The Y end is clamped to the input's dimension 1, so the window may extend past the tensor.
69 const int window_end_y = std::min(window.y().end(),
static_cast<int>(in->info()->dimension(1)));
// Row count of the window rounded down to a multiple of the tile height.
// NOTE(review): this is a length measured from window_start_y, yet below it is
// compared with window_start_y and used as an absolute Y coordinate; the two only
// coincide when window_start_y is 0 - confirm whether "window_start_y +" is missing.
70 const int window_end_y_multiple_of = ((window_end_y - window_start_y) / window_step_y) * window_step_y;
// Byte strides of dimension 1 (one row) for input and output.
71 const size_t input_stride_in_bytes = in->info()->strides_in_bytes()[1];
72 const size_t output_stride_in_bytes = out->info()->strides_in_bytes()[1];
// True when the row count is not a multiple of 8; where it is read is not visible in this listing.
75 bool left_over_loop_y = (((window_end_y - window_start_y) % window_step_y) != 0);
77 Window window_in(window);
// Restrict the tiled path to whole 8-row tiles when at least one exists.
82 if(window_end_y_multiple_of > window_start_y)
84 window_in.set(
Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y));
// The output window is collapsed in X and Y; destination addresses are computed
// by hand from `id` and `x` further down.
92 Window window_out(window);
93 window_out.set(
Window::DimX, Window::Dimension(0, 0, 0));
94 window_out.set(
Window::DimY, Window::Dimension(0, 0, 0));
96 Iterator output(out, window_out);
// Guard on the input's dimension 1 being different from 1 (extent of the guarded body is not visible here).
99 if(in->info()->dimension(1) != 1)
101 Iterator
input(in, window_in);
// Walk X manually in steps of 8 columns while a full tile still fits.
105 int x = window_start_x;
106 for(; x <= (window_end_x - window_step_x); x += window_step_x)
// Load 8 consecutive input rows, 8 bytes each, starting at column x.
108 const uint8x8_t row0 = vld1_u8(reinterpret_cast<const uint8_t *>(
input.ptr() + x + 0 * input_stride_in_bytes));
109 const uint8x8_t row1 = vld1_u8(reinterpret_cast<const uint8_t *>(
input.ptr() + x + 1 * input_stride_in_bytes));
110 const uint8x8_t row2 = vld1_u8(reinterpret_cast<const uint8_t *>(
input.ptr() + x + 2 * input_stride_in_bytes));
111 const uint8x8_t row3 = vld1_u8(reinterpret_cast<const uint8_t *>(
input.ptr() + x + 3 * input_stride_in_bytes));
112 const uint8x8_t row4 = vld1_u8(reinterpret_cast<const uint8_t *>(
input.ptr() + x + 4 * input_stride_in_bytes));
113 const uint8x8_t row5 = vld1_u8(reinterpret_cast<const uint8_t *>(
input.ptr() + x + 5 * input_stride_in_bytes));
114 const uint8x8_t row6 = vld1_u8(reinterpret_cast<const uint8_t *>(
input.ptr() + x + 6 * input_stride_in_bytes));
115 const uint8x8_t row7 = vld1_u8(reinterpret_cast<const uint8_t *>(
input.ptr() + x + 7 * input_stride_in_bytes));
// Stage 1: interleave adjacent row pairs at byte granularity (2x2 transposes).
118 const uint8x8x2_t k0_u8 = vtrn_u8(row0, row1);
119 const uint8x8x2_t k1_u8 = vtrn_u8(row2, row3);
120 const uint8x8x2_t k2_u8 = vtrn_u8(row4, row5);
121 const uint8x8x2_t k3_u8 = vtrn_u8(row6, row7);
// Stage 2: interleave at 16-bit granularity (4x4 transposes).
124 const uint16x4x2_t k0_u16 = vtrn_u16(vreinterpret_u16_u8(k0_u8.val[0]), vreinterpret_u16_u8(k1_u8.val[0]));
125 const uint16x4x2_t k1_u16 = vtrn_u16(vreinterpret_u16_u8(k0_u8.val[1]), vreinterpret_u16_u8(k1_u8.val[1]));
126 const uint16x4x2_t k2_u16 = vtrn_u16(vreinterpret_u16_u8(k2_u8.val[0]), vreinterpret_u16_u8(k3_u8.val[0]));
127 const uint16x4x2_t k3_u16 = vtrn_u16(vreinterpret_u16_u8(k2_u8.val[1]), vreinterpret_u16_u8(k3_u8.val[1]));
// Stage 3: interleave at 32-bit granularity, completing the 8x8 transpose.
// Resulting input columns: k0 -> 0 and 4, k2 -> 1 and 5, k1 -> 2 and 6, k3 -> 3 and 7.
130 const uint32x2x2_t k0_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[0]), vreinterpret_u32_u16(k2_u16.val[0]));
131 const uint32x2x2_t k1_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[1]), vreinterpret_u32_u16(k2_u16.val[1]));
132 const uint32x2x2_t k2_u32 = vtrn_u32(vreinterpret_u32_u16(k1_u16.val[0]), vreinterpret_u32_u16(k3_u16.val[0]));
133 const uint32x2x2_t k3_u32 = vtrn_u32(vreinterpret_u32_u16(k1_u16.val[1]), vreinterpret_u32_u16(k3_u16.val[1]));
// Destination: input row index becomes the byte offset within an output row,
// input column x becomes the output row.
// `id` is not declared in the visible lines - presumably the coordinate handed to
// the window-loop callback; confirm.
136 const size_t dst_offset_in_bytes =
id.y() *
sizeof(uint8_t) + x * output_stride_in_bytes;
// Store order k0, k2, k1, k3 (val[0] then val[1]) writes input columns 0..7 to output rows x+0..x+7.
138 vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k0_u32.val[0])));
139 vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k2_u32.val[0])));
140 vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k1_u32.val[0])));
141 vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k3_u32.val[0])));
142 vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 4 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k0_u32.val[1])));
143 vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 5 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k2_u32.val[1])));
144 vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 6 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k1_u32.val[1])));
145 vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 7 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k3_u32.val[1])));
// X remainder: one input column at a time, still 8 rows deep.
149 for(; x < window_end_x; ++x)
// Gather the byte at column x from each of the 8 rows.
151 const uint8_t val0 = *(
input.ptr() + x + 0 * input_stride_in_bytes);
152 const uint8_t val1 = *(
input.ptr() + x + 1 * input_stride_in_bytes);
153 const uint8_t val2 = *(
input.ptr() + x + 2 * input_stride_in_bytes);
154 const uint8_t val3 = *(
input.ptr() + x + 3 * input_stride_in_bytes);
155 const uint8_t val4 = *(
input.ptr() + x + 4 * input_stride_in_bytes);
156 const uint8_t val5 = *(
input.ptr() + x + 5 * input_stride_in_bytes);
157 const uint8_t val6 = *(
input.ptr() + x + 6 * input_stride_in_bytes);
158 const uint8_t val7 = *(
input.ptr() + x + 7 * input_stride_in_bytes);
// Pack the 8 gathered bytes into lanes 0..7 so they can be written with one store.
160 uint8x8_t result = vdup_n_u8(0);
161 result = vset_lane_u8(val0, result, 0);
162 result = vset_lane_u8(val1, result, 1);
163 result = vset_lane_u8(val2, result, 2);
164 result = vset_lane_u8(val3, result, 3);
165 result = vset_lane_u8(val4, result, 4);
166 result = vset_lane_u8(val5, result, 5);
167 result = vset_lane_u8(val6, result, 6);
168 result = vset_lane_u8(val7, result, 7);
// Same addressing as the tiled store: byte offset id.y() within output row x.
171 const size_t dst_offset_in_bytes =
id.y() *
sizeof(uint8_t) + x * output_stride_in_bytes;
173 vst1_u8(output.ptr() + dst_offset_in_bytes, result);
// Left-over rows: re-window the input to cover the full X range and the rows from
// window_end_y_multiple_of up to window_end_y, one element per step.
181 window_in.set(
Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1));
182 window_in.set(
Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1));
184 Iterator
input(in, window_in);
185 Iterator output(out, window_out);
// Copy a single element per step.
190 const uint8_t val0 = *
input.ptr();
// Swap the coordinates: input (x, y) is written to output row x at byte offset y.
193 const size_t dst_offset_in_bytes =
id.y() *
sizeof(uint8_t) +
id.x() * output_stride_in_bytes;
195 *(output.ptr() + dst_offset_in_bytes) = val0;
// Transposes 2-byte elements from `in` into `out` over `window`.
// The visible code has two paths: a 4x4 tiled path built on NEON vtrn intrinsics
// (with a per-column fallback for the X remainder), and an element-by-element
// path for the rows left over after the last full 4-row tile.
// NOTE(review): the leading integers look like line numbers of the original file,
// and structural lines (braces, loop/callback headers) are absent from this
// listing - confirm details against the real source.
201 void transpose_16bit_elements(
const ITensor *in, ITensor *out,
const Window &window)
// Tile size: 4 columns by 4 rows of uint16_t per vectorised step.
203 const int window_step_x = 4;
204 const int window_step_y = 4;
205 const int window_start_x = window.x().start();
206 const int window_end_x = window.x().end();
207 const int window_start_y = window.y().start();
// The Y end is clamped to the input's dimension 1, so the window may extend past the tensor.
208 const int window_end_y = std::min(window.y().end(),
static_cast<int>(in->info()->dimension(1)));
// Row count of the window rounded down to a multiple of the tile height.
// NOTE(review): this is a length measured from window_start_y, yet below it is
// compared with window_start_y and used as an absolute Y coordinate; the two only
// coincide when window_start_y is 0 - confirm whether "window_start_y +" is missing.
209 const int window_end_y_multiple_of = ((window_end_y - window_start_y) / window_step_y) * window_step_y;
// Byte strides of dimension 1 (one row) for input and output.
210 const size_t input_stride_in_bytes = in->info()->strides_in_bytes()[1];
211 const size_t output_stride_in_bytes = out->info()->strides_in_bytes()[1];
// True when the row count is not a multiple of 4; where it is read is not visible in this listing.
214 bool left_over_loop_y = (((window_end_y - window_start_y) % window_step_y) != 0);
216 Window window_in(window);
// Collapse X to a single step: columns are walked by the explicit `x` loops below.
217 window_in.set(
Window::DimX, Window::Dimension(0, 1, 1));
// Restrict the tiled path to whole 4-row tiles when at least one exists.
221 if(window_end_y_multiple_of > window_start_y)
223 window_in.set(
Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y));
// Empty Y range (0, 0) - presumably the else-branch used when no full tile fits; confirm.
227 window_in.set(
Window::DimY, Window::Dimension(0, 0, 1));
// The output window is collapsed in X and Y; destination addresses are computed
// by hand from `id` and `x` further down.
231 Window window_out(window);
232 window_out.set(
Window::DimX, Window::Dimension(0, 0, 0));
233 window_out.set(
Window::DimY, Window::Dimension(0, 0, 0));
235 Iterator output(out, window_out);
// Guard on the input's dimension 1 being different from 1 (extent of the guarded body is not visible here).
238 if(in->info()->dimension(1) != 1)
240 Iterator
input(in, window_in);
// Walk X manually in steps of 4 columns while a full tile still fits.
244 int x = window_start_x;
245 for(; x <= (window_end_x - window_step_x); x += window_step_x)
// Load 4 consecutive input rows, 4 elements each. The row offset is applied in
// bytes, then `x` is added after the cast, so it counts uint16_t elements.
247 const uint16x4_t row0 = vld1_u16(reinterpret_cast<const uint16_t *>(
input.ptr() + 0 * input_stride_in_bytes) + x);
248 const uint16x4_t row1 = vld1_u16(reinterpret_cast<const uint16_t *>(
input.ptr() + 1 * input_stride_in_bytes) + x);
249 const uint16x4_t row2 = vld1_u16(reinterpret_cast<const uint16_t *>(
input.ptr() + 2 * input_stride_in_bytes) + x);
250 const uint16x4_t row3 = vld1_u16(reinterpret_cast<const uint16_t *>(
input.ptr() + 3 * input_stride_in_bytes) + x);
// Stage 1: interleave adjacent row pairs at 16-bit granularity (2x2 transposes).
253 const uint16x4x2_t k0_u16 = vtrn_u16(row0, row1);
254 const uint16x4x2_t k1_u16 = vtrn_u16(row2, row3);
// Stage 2: interleave at 32-bit granularity, completing the 4x4 transpose.
// Resulting input columns: k0 -> 0 and 2, k1 -> 1 and 3.
257 const uint32x2x2_t k0_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[0]), vreinterpret_u32_u16(k1_u16.val[0]));
258 const uint32x2x2_t k1_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[1]), vreinterpret_u32_u16(k1_u16.val[1]));
// Destination: input row index becomes the element offset within an output row,
// input column x becomes the output row.
// `id` is not declared in the visible lines - presumably the coordinate handed to
// the window-loop callback; confirm.
261 const size_t dst_offset_in_bytes =
id.y() *
sizeof(uint16_t) + x * output_stride_in_bytes;
// Store order k0, k1 (val[0] then val[1]) writes input columns 0..3 to output rows x+0..x+3.
263 vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vreinterpret_u16_u32(k0_u32.val[0]));
264 vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vreinterpret_u16_u32(k1_u32.val[0]));
265 vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vreinterpret_u16_u32(k0_u32.val[1]));
266 vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vreinterpret_u16_u32(k1_u32.val[1]));
// X remainder: one input column at a time, still 4 rows deep.
270 for(; x < window_end_x; ++x)
// Gather the element at column x from each of the 4 rows.
272 const uint16_t val0 = *(
reinterpret_cast<uint16_t *
>(
input.ptr() + 0 * input_stride_in_bytes) + x);
273 const uint16_t val1 = *(
reinterpret_cast<uint16_t *
>(
input.ptr() + 1 * input_stride_in_bytes) + x);
274 const uint16_t val2 = *(
reinterpret_cast<uint16_t *
>(
input.ptr() + 2 * input_stride_in_bytes) + x);
275 const uint16_t val3 = *(
reinterpret_cast<uint16_t *
>(
input.ptr() + 3 * input_stride_in_bytes) + x);
// Pack the 4 gathered elements into lanes 0..3 so they can be written with one store.
277 uint16x4_t result = vdup_n_u16(0);
278 result = vset_lane_u16(val0, result, 0);
279 result = vset_lane_u16(val1, result, 1);
280 result = vset_lane_u16(val2, result, 2);
281 result = vset_lane_u16(val3, result, 3);
// Same addressing as the tiled store: element offset id.y() within output row x.
284 const size_t dst_offset_in_bytes =
id.y() *
sizeof(uint16_t) + x * output_stride_in_bytes;
286 vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes), result);
// Left-over rows: re-window the input to cover the full X range and the rows from
// window_end_y_multiple_of up to window_end_y, one element per step.
294 window_in.set(
Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1));
295 window_in.set(
Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1));
297 Iterator
input(in, window_in);
298 Iterator output(out, window_out);
// Copy a single element per step.
303 const uint16_t val0 = *(
reinterpret_cast<uint16_t *
>(
input.ptr()));
// Swap the coordinates: input (x, y) is written to output row x at element offset y.
306 const size_t dst_offset_in_bytes =
id.y() *
sizeof(uint16_t) +
id.x() * output_stride_in_bytes;
308 *(
reinterpret_cast<uint16_t *
>(output.ptr() + dst_offset_in_bytes)) = val0;
// Transposes 4-byte elements from `in` into `out` over `window`.
// The visible code has two paths: a 4x4 tiled path using 128-bit NEON loads and
// vtrn_u32 on the vector halves (with a per-column fallback for the X remainder),
// and an element-by-element path for the rows left over after the last full 4-row tile.
// NOTE(review): the leading integers look like line numbers of the original file,
// and structural lines (braces, loop/callback headers) are absent from this
// listing - confirm details against the real source.
314 void transpose_32bit_elements(
const ITensor *in, ITensor *out,
const Window &window)
// Tile size: 4 columns by 4 rows of uint32_t per vectorised step.
316 const int window_step_x = 4;
317 const int window_step_y = 4;
318 const int window_start_x = window.x().start();
319 const int window_end_x = window.x().end();
320 const int window_start_y = window.y().start();
// The Y end is clamped to the input's dimension 1, so the window may extend past the tensor.
321 const int window_end_y = std::min(window.y().end(),
static_cast<int>(in->info()->dimension(1)));
// Row count of the window rounded down to a multiple of the tile height.
// NOTE(review): this is a length measured from window_start_y, yet below it is
// compared with window_start_y and used as an absolute Y coordinate; the two only
// coincide when window_start_y is 0 - confirm whether "window_start_y +" is missing.
322 const int window_end_y_multiple_of = ((window_end_y - window_start_y) / window_step_y) * window_step_y;
// Byte strides of dimension 1 (one row) for input and output.
323 const size_t input_stride_in_bytes = in->info()->strides_in_bytes()[1];
324 const size_t output_stride_in_bytes = out->info()->strides_in_bytes()[1];
// True when the row count is not a multiple of 4; where it is read is not visible in this listing.
327 bool left_over_loop_y = (((window_end_y - window_start_y) % window_step_y) != 0);
329 Window window_in(window);
// Collapse X to a single step: columns are walked by the explicit `x` loops below.
330 window_in.set(
Window::DimX, Window::Dimension(0, 1, 1));
// Restrict the tiled path to whole 4-row tiles when at least one exists.
334 if(window_end_y_multiple_of > window_start_y)
336 window_in.set(
Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y));
// Empty Y range (0, 0) - presumably the else-branch used when no full tile fits; confirm.
340 window_in.set(
Window::DimY, Window::Dimension(0, 0, 1));
// The output window is collapsed in X and Y; destination addresses are computed
// by hand from `id` and `x` further down.
344 Window window_out(window);
345 window_out.set(
Window::DimX, Window::Dimension(0, 0, 0));
346 window_out.set(
Window::DimY, Window::Dimension(0, 0, 0));
348 Iterator output(out, window_out);
// Guard on the input's dimension 1 being different from 1 (extent of the guarded body is not visible here).
351 if(in->info()->dimension(1) != 1)
353 Iterator
input(in, window_in);
// Walk X manually in steps of 4 columns while a full tile still fits.
357 int x = window_start_x;
358 for(; x <= (window_end_x - window_step_x); x += window_step_x)
// Load 4 consecutive input rows, 4 elements each. The row offset is applied in
// bytes, then `x` is added after the cast, so it counts uint32_t elements.
360 const uint32x4_t row0 = vld1q_u32(reinterpret_cast<const uint32_t *>(
input.ptr() + 0 * input_stride_in_bytes) + x);
361 const uint32x4_t row1 = vld1q_u32(reinterpret_cast<const uint32_t *>(
input.ptr() + 1 * input_stride_in_bytes) + x);
362 const uint32x4_t row2 = vld1q_u32(reinterpret_cast<const uint32_t *>(
input.ptr() + 2 * input_stride_in_bytes) + x);
363 const uint32x4_t row3 = vld1q_u32(reinterpret_cast<const uint32_t *>(
input.ptr() + 3 * input_stride_in_bytes) + x);
// 2x2 transposes of the four quadrants of the tile:
//   k0: rows 0-1, columns 0-1    k2: rows 0-1, columns 2-3
//   k3: rows 2-3, columns 0-1    k1: rows 2-3, columns 2-3
// (the k1/k3 numbering does not follow quadrant order; the stores below rely on it).
366 const uint32x2x2_t k0_u32 = vtrn_u32(vget_low_u32(row0), vget_low_u32(row1));
367 const uint32x2x2_t k1_u32 = vtrn_u32(vget_high_u32(row2), vget_high_u32(row3));
368 const uint32x2x2_t k2_u32 = vtrn_u32(vget_high_u32(row0), vget_high_u32(row1));
369 const uint32x2x2_t k3_u32 = vtrn_u32(vget_low_u32(row2), vget_low_u32(row3));
// Destination: input row index becomes the element offset within an output row,
// input column x becomes the output row.
// `id` is not declared in the visible lines - presumably the coordinate handed to
// the window-loop callback; confirm.
372 const size_t dst_offset_in_bytes =
id.y() *
sizeof(uint32_t) + x * output_stride_in_bytes;
// Each output row joins the rows 0-1 half with the rows 2-3 half of one input column:
// (k0, k3) give columns 0 and 1, (k2, k1) give columns 2 and 3.
375 vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vcombine_u32(k0_u32.val[0], k3_u32.val[0]));
376 vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vcombine_u32(k0_u32.val[1], k3_u32.val[1]));
377 vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vcombine_u32(k2_u32.val[0], k1_u32.val[0]));
378 vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vcombine_u32(k2_u32.val[1], k1_u32.val[1]));
// X remainder: one input column at a time, still 4 rows deep.
382 for(; x < window_end_x; ++x)
// Gather the element at column x from each of the 4 rows.
384 const uint32_t val0 = *(
reinterpret_cast<uint32_t *
>(
input.ptr() + 0 * input_stride_in_bytes) + x);
385 const uint32_t val1 = *(
reinterpret_cast<uint32_t *
>(
input.ptr() + 1 * input_stride_in_bytes) + x);
386 const uint32_t val2 = *(
reinterpret_cast<uint32_t *
>(
input.ptr() + 2 * input_stride_in_bytes) + x);
387 const uint32_t val3 = *(
reinterpret_cast<uint32_t *
>(
input.ptr() + 3 * input_stride_in_bytes) + x);
// Pack the 4 gathered elements into lanes 0..3 so they can be written with one store.
389 uint32x4_t result = vdupq_n_u32(0);
390 result = vsetq_lane_u32(val0, result, 0);
391 result = vsetq_lane_u32(val1, result, 1);
392 result = vsetq_lane_u32(val2, result, 2);
393 result = vsetq_lane_u32(val3, result, 3);
// Same addressing as the tiled store: element offset id.y() within output row x.
396 const size_t dst_offset_in_bytes =
id.y() *
sizeof(uint32_t) + x * output_stride_in_bytes;
398 vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes), result);
// Left-over rows: re-window the input to cover the full X range and the rows from
// window_end_y_multiple_of up to window_end_y, one element per step.
406 window_in.set(
Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1));
407 window_in.set(
Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1));
409 Iterator
input(in, window_in);
410 Iterator output(out, window_out);
// Copy a single element per step.
415 const uint32_t val0 = *(
reinterpret_cast<uint32_t *
>(
input.ptr()));
// Swap the coordinates: input (x, y) is written to output row x at element offset y.
418 const size_t dst_offset_in_bytes =
id.y() *
sizeof(uint32_t) +
id.x() * output_stride_in_bytes;
420 *(
reinterpret_cast<uint32_t *
>(output.ptr() + dst_offset_in_bytes)) = val0;
441 const unsigned int num_elems_processed_per_iteration_x = 1;
442 const unsigned int num_elems_processed_per_iteration_y = num_elems_processed(src->
element_size());
452 ICpuKernel::configure(win);
463 "Element size not supported");
487 switch(
src->info()->element_size())
490 transpose_8bit_elements(
src,
dst, window);
493 transpose_16bit_elements(
src,
dst, window);
496 transpose_32bit_elements(
src,
dst, window);
506 return "CpuTransposeKernel";
virtual size_t num_dimensions() const =0
The number of dimensions of the tensor (rank)
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
const Window & window() const
The maximum window the kernel can be executed on.
void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(...)
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
virtual DataType data_type() const =0
Data type used for each element of the tensor.
Store the tensor's metadata.
#define ARM_COMPUTE_ERROR_THROW_ON(status)
#define ARM_COMPUTE_RETURN_ERROR_ON(cond)
If the condition is true, an error is returned.
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(...)
SimpleTensor< float > src
Copyright (c) 2017-2021 Arm Limited.
virtual void set_valid_region(const ValidRegion &valid_region)=0
Set the valid region of the tensor.
TensorShape compute_transposed_shape(const ITensorInfo &input)
Calculate the transposed shape of a tensor.
#define ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(...)
const ITensor * get_const_tensor(int id) const
Get constant tensor of a given id.
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
static Status validate(const ITensorInfo *src, const ITensorInfo *dst)
Static function to check if given info will lead to a valid configuration.
virtual const TensorShape & tensor_shape() const =0
Size for each dimension of the tensor.
const char * name() const override
Name of the kernel.
Class to describe a number of elements in each dimension.
bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, QuantizationInfo quantization_info=QuantizationInfo())
Auto initialize the tensor info (shape, number of channels and data type) if the current assignment is empty.
virtual std::unique_ptr< T > clone() const =0
Provide a clone of the current object of class T.
virtual size_t element_size() const =0
Element size in bytes calculated as data_size() * num_channels()
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
static constexpr size_t DimY
Alias for dimension 1 also known as Y dimension.
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
ITensor * get_tensor(int id)
Get tensor of a given id from the pack.
Information about executing thread and CPU.
virtual size_t total_size() const =0
Returns the total size of the tensor in bytes.
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(...)
#define ARM_COMPUTE_RETURN_ERROR_ON_MSG(cond, msg)
If the condition is true, an error is returned.
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_function for each element.
void set_num_dimensions(size_t num_dimensions)
Set number of dimensions.
Container for valid region of a window.
Describe a multidimensional execution window.
void configure(const ITensorInfo *src, ITensorInfo *dst)
Configure kernel for a given list of arguments.
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)