Arm Compute Library 21.02 — source listing of NETransposeKernel.cpp.
1 /*
2  * Copyright (c) 2017-2021 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
25 
26 #include "arm_compute/core/Error.h"
30 #include "arm_compute/core/Utils.h"
36 
37 #include <arm_neon.h>
38 
39 using namespace arm_compute;
40 
41 namespace arm_compute
42 {
43 class Coordinates;
44 } // namespace arm_compute
45 
46 namespace
47 {
48 TensorShape transposed_tensor_shape(const TensorShape &in)
49 {
51  const size_t w_out = in[1];
52  const size_t h_out = in[0];
53  output_shape.set(0, w_out);
54  output_shape.set(1, h_out);
55 
56  return output_shape;
57 }
58 
60 {
62  //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use Neon FP16 instructions.
64 
65  if(output->total_size() != 0)
66  {
67  const TensorInfo tensor_info = input->clone()->set_tensor_shape(transposed_tensor_shape(input->tensor_shape()));
68 
72  }
73 
74  return Status{};
75 }
76 unsigned int num_elems_processed(size_t element_size)
77 {
78  switch(element_size)
79  {
80  case 1:
81  return 8;
82  case 2:
83  case 4:
84  return 4;
85  default:
86  break;
87  }
88 
89  ARM_COMPUTE_ERROR("Element size not supported");
90 }
91 
92 void transpose_8bit_elements(const ITensor *in, ITensor *out, const Window &window)
93 {
94  const int window_step_x = 8;
95  const int window_step_y = 8;
96  const int window_start_x = window.x().start();
97  const int window_end_x = window.x().end();
98  const int window_start_y = window.y().start();
99  const int window_end_y = std::min(window.y().end(), static_cast<int>(in->info()->dimension(1)));
100  const int window_end_y_multiple_of = ((window_end_y - window_start_y) / window_step_y) * window_step_y;
101  const size_t input_stride_in_bytes = in->info()->strides_in_bytes()[1];
102  const size_t output_stride_in_bytes = out->info()->strides_in_bytes()[1];
103 
104  // Check if we need a left-over loop for the y dimension
105  bool left_over_loop_y = (((window_end_y - window_start_y) % window_step_y) != 0);
106 
107  Window window_in(window);
108  window_in.set(Window::DimX, Window::Dimension(0, 1, 1));
109  if(left_over_loop_y)
110  {
111  // Check if window_end_y_multiple_of is greater than window_start_y
112  if(window_end_y_multiple_of > window_start_y)
113  {
114  window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y));
115  }
116  else
117  {
118  window_in.set(Window::DimY, Window::Dimension(0, 0, 1));
119  }
120  }
121 
122  Window window_out(window);
123  window_out.set(Window::DimX, Window::Dimension(0, 0, 0));
124  window_out.set(Window::DimY, Window::Dimension(0, 0, 0));
125 
126  Iterator output(out, window_out);
127 
128  // Run the Neon path if and only if the input is not a row-vector
129  if(in->info()->dimension(1) != 1)
130  {
131  Iterator input(in, window_in);
132  execute_window_loop(window_in, [&](const Coordinates & id)
133  {
134  // Compute 8x8 elements per iteration
135  int x = window_start_x;
136  for(; x <= (window_end_x - window_step_x); x += window_step_x)
137  {
138  const uint8x8_t row0 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 0 * input_stride_in_bytes));
139  const uint8x8_t row1 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 1 * input_stride_in_bytes));
140  const uint8x8_t row2 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 2 * input_stride_in_bytes));
141  const uint8x8_t row3 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 3 * input_stride_in_bytes));
142  const uint8x8_t row4 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 4 * input_stride_in_bytes));
143  const uint8x8_t row5 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 5 * input_stride_in_bytes));
144  const uint8x8_t row6 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 6 * input_stride_in_bytes));
145  const uint8x8_t row7 = vld1_u8(reinterpret_cast<const uint8_t *>(input.ptr() + x + 7 * input_stride_in_bytes));
146 
147  // Transpose 2x2
148  const uint8x8x2_t k0_u8 = vtrn_u8(row0, row1);
149  const uint8x8x2_t k1_u8 = vtrn_u8(row2, row3);
150  const uint8x8x2_t k2_u8 = vtrn_u8(row4, row5);
151  const uint8x8x2_t k3_u8 = vtrn_u8(row6, row7);
152 
153  // Transpose 4x4
154  const uint16x4x2_t k0_u16 = vtrn_u16(vreinterpret_u16_u8(k0_u8.val[0]), vreinterpret_u16_u8(k1_u8.val[0]));
155  const uint16x4x2_t k1_u16 = vtrn_u16(vreinterpret_u16_u8(k0_u8.val[1]), vreinterpret_u16_u8(k1_u8.val[1]));
156  const uint16x4x2_t k2_u16 = vtrn_u16(vreinterpret_u16_u8(k2_u8.val[0]), vreinterpret_u16_u8(k3_u8.val[0]));
157  const uint16x4x2_t k3_u16 = vtrn_u16(vreinterpret_u16_u8(k2_u8.val[1]), vreinterpret_u16_u8(k3_u8.val[1]));
158 
159  // Transpose 8x8
160  const uint32x2x2_t k0_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[0]), vreinterpret_u32_u16(k2_u16.val[0]));
161  const uint32x2x2_t k1_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[1]), vreinterpret_u32_u16(k2_u16.val[1]));
162  const uint32x2x2_t k2_u32 = vtrn_u32(vreinterpret_u32_u16(k1_u16.val[0]), vreinterpret_u32_u16(k3_u16.val[0]));
163  const uint32x2x2_t k3_u32 = vtrn_u32(vreinterpret_u32_u16(k1_u16.val[1]), vreinterpret_u32_u16(k3_u16.val[1]));
164 
165  // Compute destination address
166  const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + x * output_stride_in_bytes;
167 
168  vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k0_u32.val[0])));
169  vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k2_u32.val[0])));
170  vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k1_u32.val[0])));
171  vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k3_u32.val[0])));
172  vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 4 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k0_u32.val[1])));
173  vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 5 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k2_u32.val[1])));
174  vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 6 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k1_u32.val[1])));
175  vst1_u8(reinterpret_cast<uint8_t *>(output.ptr() + dst_offset_in_bytes + 7 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k3_u32.val[1])));
176  }
177 
178  // Compute left-over elements along the x dimension (1x8)
179  for(; x < window_end_x; ++x)
180  {
181  const uint8_t val0 = *(input.ptr() + x + 0 * input_stride_in_bytes);
182  const uint8_t val1 = *(input.ptr() + x + 1 * input_stride_in_bytes);
183  const uint8_t val2 = *(input.ptr() + x + 2 * input_stride_in_bytes);
184  const uint8_t val3 = *(input.ptr() + x + 3 * input_stride_in_bytes);
185  const uint8_t val4 = *(input.ptr() + x + 4 * input_stride_in_bytes);
186  const uint8_t val5 = *(input.ptr() + x + 5 * input_stride_in_bytes);
187  const uint8_t val6 = *(input.ptr() + x + 6 * input_stride_in_bytes);
188  const uint8_t val7 = *(input.ptr() + x + 7 * input_stride_in_bytes);
189 
190  uint8x8_t result = vdup_n_u8(0);
191  result = vset_lane_u8(val0, result, 0);
192  result = vset_lane_u8(val1, result, 1);
193  result = vset_lane_u8(val2, result, 2);
194  result = vset_lane_u8(val3, result, 3);
195  result = vset_lane_u8(val4, result, 4);
196  result = vset_lane_u8(val5, result, 5);
197  result = vset_lane_u8(val6, result, 6);
198  result = vset_lane_u8(val7, result, 7);
199 
200  // Compute destination address
201  const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + x * output_stride_in_bytes;
202 
203  vst1_u8(output.ptr() + dst_offset_in_bytes, result);
204  }
205  },
206  input, output);
207  }
208 
209  if(left_over_loop_y)
210  {
211  window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1));
212  window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1));
213 
214  Iterator input(in, window_in);
215  Iterator output(out, window_out);
216 
217  // Compute left-over elements along the y dimension (1x1)
218  execute_window_loop(window_in, [&](const Coordinates & id)
219  {
220  const uint8_t val0 = *input.ptr();
221 
222  // Compute destination address
223  const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + id.x() * output_stride_in_bytes;
224 
225  *(output.ptr() + dst_offset_in_bytes) = val0;
226  },
227  input, output);
228  }
229 }
230 
231 void transpose_16bit_elements(const ITensor *in, ITensor *out, const Window &window)
232 {
233  const int window_step_x = 4;
234  const int window_step_y = 4;
235  const int window_start_x = window.x().start();
236  const int window_end_x = window.x().end();
237  const int window_start_y = window.y().start();
238  const int window_end_y = std::min(window.y().end(), static_cast<int>(in->info()->dimension(1)));
239  const int window_end_y_multiple_of = ((window_end_y - window_start_y) / window_step_y) * window_step_y;
240  const size_t input_stride_in_bytes = in->info()->strides_in_bytes()[1];
241  const size_t output_stride_in_bytes = out->info()->strides_in_bytes()[1];
242 
243  // Check if we need a left-over loop for the y dimension
244  bool left_over_loop_y = (((window_end_y - window_start_y) % window_step_y) != 0);
245 
246  Window window_in(window);
247  window_in.set(Window::DimX, Window::Dimension(0, 1, 1));
248  if(left_over_loop_y)
249  {
250  // Check if window_end_y_multiple_of is greater than window_start_y
251  if(window_end_y_multiple_of > window_start_y)
252  {
253  window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y));
254  }
255  else
256  {
257  window_in.set(Window::DimY, Window::Dimension(0, 0, 1));
258  }
259  }
260 
261  Window window_out(window);
262  window_out.set(Window::DimX, Window::Dimension(0, 0, 0));
263  window_out.set(Window::DimY, Window::Dimension(0, 0, 0));
264 
265  Iterator output(out, window_out);
266 
267  // Run the Neon path if and only if the input is not a row-vector
268  if(in->info()->dimension(1) != 1)
269  {
270  Iterator input(in, window_in);
271  execute_window_loop(window_in, [&](const Coordinates & id)
272  {
273  // Compute 4x4 elements per iteration
274  int x = window_start_x;
275  for(; x <= (window_end_x - window_step_x); x += window_step_x)
276  {
277  const uint16x4_t row0 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
278  const uint16x4_t row1 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
279  const uint16x4_t row2 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
280  const uint16x4_t row3 = vld1_u16(reinterpret_cast<const uint16_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
281 
282  // Transpose 2x2
283  const uint16x4x2_t k0_u16 = vtrn_u16(row0, row1);
284  const uint16x4x2_t k1_u16 = vtrn_u16(row2, row3);
285 
286  // Transpose 4x4
287  const uint32x2x2_t k0_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[0]), vreinterpret_u32_u16(k1_u16.val[0]));
288  const uint32x2x2_t k1_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[1]), vreinterpret_u32_u16(k1_u16.val[1]));
289 
290  // Compute destination address
291  const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + x * output_stride_in_bytes;
292 
293  vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vreinterpret_u16_u32(k0_u32.val[0]));
294  vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vreinterpret_u16_u32(k1_u32.val[0]));
295  vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vreinterpret_u16_u32(k0_u32.val[1]));
296  vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vreinterpret_u16_u32(k1_u32.val[1]));
297  }
298 
299  // Compute left-over elements (1x4)
300  for(; x < window_end_x; ++x)
301  {
302  const uint16_t val0 = *(reinterpret_cast<uint16_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
303  const uint16_t val1 = *(reinterpret_cast<uint16_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
304  const uint16_t val2 = *(reinterpret_cast<uint16_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
305  const uint16_t val3 = *(reinterpret_cast<uint16_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
306 
307  uint16x4_t result = vdup_n_u16(0);
308  result = vset_lane_u16(val0, result, 0);
309  result = vset_lane_u16(val1, result, 1);
310  result = vset_lane_u16(val2, result, 2);
311  result = vset_lane_u16(val3, result, 3);
312 
313  // Compute destination address
314  const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + x * output_stride_in_bytes;
315 
316  vst1_u16(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes), result);
317  }
318  },
319  input, output);
320  }
321 
322  if(left_over_loop_y)
323  {
324  window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1));
325  window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1));
326 
327  Iterator input(in, window_in);
328  Iterator output(out, window_out);
329 
330  // Compute left-over elements along the y dimension (1x1)
331  execute_window_loop(window_in, [&](const Coordinates & id)
332  {
333  const uint16_t val0 = *(reinterpret_cast<uint16_t *>(input.ptr()));
334 
335  // Compute destination address
336  const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + id.x() * output_stride_in_bytes;
337 
338  *(reinterpret_cast<uint16_t *>(output.ptr() + dst_offset_in_bytes)) = val0;
339  },
340  input, output);
341  }
342 }
343 
344 void transpose_32bit_elements(const ITensor *in, ITensor *out, const Window &window)
345 {
346  const int window_step_x = 4;
347  const int window_step_y = 4;
348  const int window_start_x = window.x().start();
349  const int window_end_x = window.x().end();
350  const int window_start_y = window.y().start();
351  const int window_end_y = std::min(window.y().end(), static_cast<int>(in->info()->dimension(1)));
352  const int window_end_y_multiple_of = ((window_end_y - window_start_y) / window_step_y) * window_step_y;
353  const size_t input_stride_in_bytes = in->info()->strides_in_bytes()[1];
354  const size_t output_stride_in_bytes = out->info()->strides_in_bytes()[1];
355 
356  // Check if we need a left-over loop for the y dimension
357  bool left_over_loop_y = (((window_end_y - window_start_y) % window_step_y) != 0);
358 
359  Window window_in(window);
360  window_in.set(Window::DimX, Window::Dimension(0, 1, 1));
361  if(left_over_loop_y)
362  {
363  // Check if window_end_y_multiple_of is greater than window_start_y
364  if(window_end_y_multiple_of > window_start_y)
365  {
366  window_in.set(Window::DimY, Window::Dimension(window_start_y, window_end_y_multiple_of, window_step_y));
367  }
368  else
369  {
370  window_in.set(Window::DimY, Window::Dimension(0, 0, 1));
371  }
372  }
373 
374  Window window_out(window);
375  window_out.set(Window::DimX, Window::Dimension(0, 0, 0));
376  window_out.set(Window::DimY, Window::Dimension(0, 0, 0));
377 
378  Iterator output(out, window_out);
379 
380  // Run the Neon path if and only if the input is not a row-vector
381  if(in->info()->dimension(1) != 1)
382  {
383  Iterator input(in, window_in);
384  execute_window_loop(window_in, [&](const Coordinates & id)
385  {
386  // Compute 4x4 elements per iteration
387  int x = window_start_x;
388  for(; x <= (window_end_x - window_step_x); x += window_step_x)
389  {
390  const uint32x4_t row0 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
391  const uint32x4_t row1 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
392  const uint32x4_t row2 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
393  const uint32x4_t row3 = vld1q_u32(reinterpret_cast<const uint32_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
394 
395  // Transpose 2x2
396  const uint32x2x2_t k0_u32 = vtrn_u32(vget_low_u32(row0), vget_low_u32(row1));
397  const uint32x2x2_t k1_u32 = vtrn_u32(vget_high_u32(row2), vget_high_u32(row3));
398  const uint32x2x2_t k2_u32 = vtrn_u32(vget_high_u32(row0), vget_high_u32(row1));
399  const uint32x2x2_t k3_u32 = vtrn_u32(vget_low_u32(row2), vget_low_u32(row3));
400 
401  // Compute destination address
402  const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes;
403 
404  // Swap block 01 with block 10 and store
405  vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vcombine_u32(k0_u32.val[0], k3_u32.val[0]));
406  vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vcombine_u32(k0_u32.val[1], k3_u32.val[1]));
407  vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vcombine_u32(k2_u32.val[0], k1_u32.val[0]));
408  vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vcombine_u32(k2_u32.val[1], k1_u32.val[1]));
409  }
410 
411  // Compute left-over elements (1x4)
412  for(; x < window_end_x; ++x)
413  {
414  const uint32_t val0 = *(reinterpret_cast<uint32_t *>(input.ptr() + 0 * input_stride_in_bytes) + x);
415  const uint32_t val1 = *(reinterpret_cast<uint32_t *>(input.ptr() + 1 * input_stride_in_bytes) + x);
416  const uint32_t val2 = *(reinterpret_cast<uint32_t *>(input.ptr() + 2 * input_stride_in_bytes) + x);
417  const uint32_t val3 = *(reinterpret_cast<uint32_t *>(input.ptr() + 3 * input_stride_in_bytes) + x);
418 
419  uint32x4_t result = vdupq_n_u32(0);
420  result = vsetq_lane_u32(val0, result, 0);
421  result = vsetq_lane_u32(val1, result, 1);
422  result = vsetq_lane_u32(val2, result, 2);
423  result = vsetq_lane_u32(val3, result, 3);
424 
425  // Compute destination address
426  const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + x * output_stride_in_bytes;
427 
428  vst1q_u32(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes), result);
429  }
430  },
431  input, output);
432  }
433 
434  if(left_over_loop_y)
435  {
436  window_in.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), 1));
437  window_in.set(Window::DimY, Window::Dimension(window_end_y_multiple_of, window_end_y, 1));
438 
439  Iterator input(in, window_in);
440  Iterator output(out, window_out);
441 
442  // Compute left-over elements along the y dimension (1x1)
443  execute_window_loop(window_in, [&](const Coordinates & id)
444  {
445  const uint32_t val0 = *(reinterpret_cast<uint32_t *>(input.ptr()));
446 
447  // Compute destination address
448  const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + id.x() * output_stride_in_bytes;
449 
450  *(reinterpret_cast<uint32_t *>(output.ptr() + dst_offset_in_bytes)) = val0;
451  },
452  input, output);
453  }
454 }
455 } // namespace
456 
458 {
459  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
461  return Status{};
462 }
463 
465  : _func(nullptr), _input(nullptr), _output(nullptr)
466 {
467 }
468 
469 void NETransposeKernel::configure(const ITensor *input, ITensor *output)
470 {
471  ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
472 
473  // Output tensor auto inizialitation if not yet initialized
474  auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(transposed_tensor_shape(input->info()->tensor_shape())));
475 
477 
478  _input = input;
479  _output = output;
480 
481  switch(input->info()->element_size())
482  {
483  case 1:
484  _func = &transpose_8bit_elements;
485  break;
486  case 2:
487  _func = &transpose_16bit_elements;
488  break;
489  case 4:
490  _func = &transpose_32bit_elements;
491  break;
492  default:
493  ARM_COMPUTE_ERROR("Element size not supported");
494  break;
495  }
496 
497  // Configure kernel window
498  Coordinates coord;
499  coord.set_num_dimensions(output->info()->num_dimensions());
500  output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
501 
502  // Note: This kernel performs 16 elements per iteration.
503  // However, since we use a left-over for loop on both dimensions (X and Y), we cannot have any read or write out of memory
504  // For this reason num_elems_processed_per_iteration_x is set to 1
505  const unsigned int num_elems_processed_per_iteration_x = 1;
506  const unsigned int num_elems_processed_per_iteration_y = num_elems_processed(input->info()->element_size());
507 
508  // Configure kernel window
509  Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
510 
511  INEKernel::configure(win);
512 }
513 
514 void NETransposeKernel::run(const Window &window, const ThreadInfo &info)
515 {
516  ARM_COMPUTE_UNUSED(info);
519  ARM_COMPUTE_ERROR_ON(_func == nullptr);
520 
521  (*_func)(_input, _output, window);
522 }
virtual size_t num_dimensions() const =0
The number of dimensions of the tensor (rank)
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
const Window & window() const
The maximum window the kernel can be executed on.
Definition: IKernel.cpp:28
Shape of a tensor.
Definition: TensorShape.h:39
virtual size_t dimension(size_t index) const =0
Return the size of the requested dimension.
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(...)
Definition: Validate.h:610
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
Definition: Error.h:352
#define ARM_COMPUTE_RETURN_ON_ERROR(status)
Checks if a status contains an error and returns it.
Definition: Error.h:204
virtual DataType data_type() const =0
Data type used for each element of the tensor.
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
Definition: Error.h:466
Store the tensor&#39;s metadata.
Definition: ITensorInfo.h:40
#define ARM_COMPUTE_ERROR_THROW_ON(status)
Definition: Error.h:455
Describe one of the image&#39;s dimensions with a start, end and step.
Definition: Window.h:77
Status class.
Definition: Error.h:52
#define ARM_COMPUTE_RETURN_ERROR_ON(cond)
If the condition is true, an error is returned.
Definition: Error.h:296
Interface for Neon tensor.
Definition: ITensor.h:36
Copyright (c) 2017-2021 Arm Limited.
virtual void set_valid_region(const ValidRegion &valid_region)=0
Set the valid region of the tensor.
#define ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(...)
Definition: Validate.h:163
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
Definition: Window.h:43
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
Definition: Error.h:152
virtual const TensorShape & tensor_shape() const =0
Size for each dimension of the tensor.
Class to describe a number of elements in each dimension.
Definition: Steps.h:40
Coordinates of an item.
Definition: Coordinates.h:37
bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, QuantizationInfo quantization_info=QuantizationInfo())
Auto initialize the tensor info (shape, number of channels and data type) if the current assignment i...
void run(const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
virtual std::unique_ptr< T > clone() const =0
Provide a clone of the current object of class T.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor&#39;s metadata.
virtual size_t element_size() const =0
Element size in bytes calculated as data_size() * num_channels()
static Status validate(const ITensorInfo *input, const ITensorInfo *output)
Static function to check if given info will lead to a valid configuration of NETransposeKernel.
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
Definition: Validate.h:941
NETransposeKernel()
Default constructor.
static constexpr size_t DimY
Alias for dimension 1 also known as Y dimension.
Definition: Window.h:45
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
Information about executing thread and CPU.
Definition: CPPTypes.h:235
virtual size_t total_size() const =0
Returns the total size of the tensor in bytes.
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(...)
Definition: Validate.h:443
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(...)
Definition: Validate.h:545
constexpr const Dimension & y() const
Alias to access the second dimension of the window.
Definition: Window.h:154
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage)
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
Definition: Validate.h:161
Store the tensor&#39;s metadata.
Definition: TensorInfo.h:45
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_funct...
Definition: Helpers.inl:77
void set_num_dimensions(size_t num_dimensions)
Set number of dimensions.
Definition: Dimensions.h:149
virtual const Strides & strides_in_bytes() const =0
The strides in bytes for accessing each dimension of the tensor.
Container for valid region of a window.
Definition: Types.h:188
constexpr int end() const
Return the end of the dimension.
Definition: Window.h:99
Iterator updated by execute_window_loop for each window element.
Definition: Helpers.h:46
constexpr int start() const
Return the start of the dimension.
Definition: Window.h:94
Describe a multidimensional execution window.
Definition: Window.h:39
TensorShape & set(size_t dimension, size_t value, bool apply_dim_correction=true, bool increase_dim_unit=true)
Accessor to set the value of one of the dimensions.
Definition: TensorShape.h:79
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)
Definition: Validate.h:205
void configure(const ITensor *input, ITensor *output)
Initialise the kernel&#39;s input and output.
constexpr const Dimension & x() const
Alias to access the first dimension of the window.
Definition: Window.h:145