Compute Library 21.02
NEDepthConvertLayerKernel.cpp
/*
 * Copyright (c) 2016-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "src/core/NEON/kernels/NEDepthConvertLayerKernel.h"

#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Validate.h"
#include "src/core/CPP/Validate.h"
#include "src/core/NEON/NEFixedPoint.h"
#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/wrapper/wrapper.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"
#include "support/SaturateCast.h"

#include <arm_neon.h>

using namespace arm_compute;

namespace
{
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift)
{
    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
    ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(input);
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
    ARM_COMPUTE_UNUSED(policy);
    ARM_COMPUTE_RETURN_ERROR_ON(input == output);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8,
                                                         DataType::S16, DataType::U16, DataType::BFLOAT16, DataType::F16,
                                                         DataType::F32, DataType::S32);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::U8,
                                                         DataType::S16, DataType::U16, DataType::BFLOAT16, DataType::F16,
                                                         DataType::U32, DataType::S32, DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON(shift >= 8);

    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QASYMM8_SIGNED && (output->data_type() != DataType::S16 && output->data_type() != DataType::S32
                                                                                       && output->data_type() != DataType::F16 && output->data_type() != DataType::F32),
                                    "Only data_types supported [in] QASYMM8_SIGNED -> [out] S16, S32, F16, F32");

    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::QASYMM8 && (output->data_type() != DataType::U16 && output->data_type() != DataType::S16
                                                                                && output->data_type() != DataType::S32 && output->data_type() != DataType::F16 && output->data_type() != DataType::F32),
                                    "Only data_types supported [in] QASYMM8 -> [out] U16, S16, S32, F16, F32");

    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::U8 && (output->data_type() != DataType::U16 && output->data_type() != DataType::S16
                                                                           && output->data_type() != DataType::S32 && output->data_type() != DataType::F16 && output->data_type() != DataType::F32),
                                    "Only data_types supported [in] U8 -> [out] U16, S16, S32, F16, F32");

    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::U16 && (output->data_type() != DataType::U8 && output->data_type() != DataType::U32),
                                    "Only data_types supported [in] U16 -> [out] U8, U32");

    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::S16 && (output->data_type() != DataType::QASYMM8_SIGNED && output->data_type() != DataType::U8 && output->data_type() != DataType::S32),
                                    "Only data_types supported [in] S16 -> [out] QASYMM8_SIGNED, U8, S32");

    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::BFLOAT16 && output->data_type() != DataType::F32,
                                    "Only data_types supported [in] BFLOAT16 -> [out] F32");

    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::F16 && (output->data_type() != DataType::QASYMM8_SIGNED && output->data_type() != DataType::QASYMM8
                                                                            && output->data_type() != DataType::U8
                                                                            && output->data_type() != DataType::F32 && output->data_type() != DataType::S32),
                                    "Only data_types supported [in] F16 -> [out] QASYMM8_SIGNED, QASYMM8, U8, F32, S32");

    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::F32 && (output->data_type() != DataType::QASYMM8_SIGNED && output->data_type() != DataType::QASYMM8
                                                                            && output->data_type() != DataType::F16 && output->data_type() != DataType::BFLOAT16
                                                                            && output->data_type() != DataType::S32 && output->data_type() != DataType::U8),
                                    "Only data_types supported [in] F32 -> [out] QASYMM8_SIGNED, QASYMM8, BFLOAT16, F16, S32, U8");

    ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_type() == DataType::S32 && (output->data_type() != DataType::QASYMM8_SIGNED && output->data_type() != DataType::QASYMM8
                                                                            && output->data_type() != DataType::F16
                                                                            && output->data_type() != DataType::F32 && output->data_type() != DataType::U8),
                                    "Only data_types supported [in] S32 -> [out] QASYMM8_SIGNED, QASYMM8, F16, F32, U8");

    // Validate in case of configured output
    if(output->total_size() > 0)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
    }

    return Status{};
}
} // namespace

NEDepthConvertLayerKernel::NEDepthConvertLayerKernel()
    : _input(nullptr), _output(nullptr), _policy(), _shift(0)
{
}

void NEDepthConvertLayerKernel::configure(const ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);

    // Auto initialize output shape if not initialized (We can only auto-configure the shape, datatype must be given)
    set_shape_if_empty(*output->info(), input->info()->tensor_shape());

    _input  = input;
    _output = output;
    _policy = policy;
    _shift  = shift;

    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), policy, shift));

    // Configure kernel window
    Window      win = calculate_max_window(*input->info(), Steps());
    Coordinates coord;
    coord.set_num_dimensions(output->info()->num_dimensions());
    output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));

    ICPPKernel::configure(win);
}

Status NEDepthConvertLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, ConvertPolicy policy, uint32_t shift)
{
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, policy, shift));
    return Status{};
}
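
// Usage sketch (illustrative only, not part of this file): configuring the
// kernel for a U8 -> F32 up-conversion and running it through the scheduler.
// The tensor names `src`/`dst` and the 32x32 shape are assumptions made for
// the example.
//
//   Tensor src{}, dst{};
//   src.allocator()->init(TensorInfo(TensorShape(32U, 32U), 1, DataType::U8));
//   dst.allocator()->init(TensorInfo(TensorShape(32U, 32U), 1, DataType::F32));
//   src.allocator()->allocate();
//   dst.allocator()->allocate();
//
//   NEDepthConvertLayerKernel kernel;
//   kernel.configure(&src, &dst, ConvertPolicy::SATURATE, 0 /* shift */);
//   NEScheduler::get().schedule(&kernel, Window::DimY);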

void NEDepthConvertLayerKernel::run(const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
    ARM_COMPUTE_ERROR_ON_NULLPTR(_input, _output);
    ARM_COMPUTE_ERROR_ON(_input == _output);

    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());
    const int  window_step_x  = 16;

    Window win{ window };
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input(_input, win);
    Iterator output(_output, win);
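
    // Every conversion below follows the same pattern: a vectorised main loop
    // that processes window_step_x (16) elements per iteration with NEON
    // intrinsics, followed by a scalar loop for the left-over elements at the
    // end of each row (the window is collapsed on X above, so each call of the
    // execute_window_loop lambda covers a full row).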
    switch(_input->info()->data_type())
    {
        case DataType::QASYMM8_SIGNED:
        {
            const int16x8_t b = vdupq_n_s16(_shift);

            switch(_output->info()->data_type())
            {
                case DataType::S16:
                {
                    /* Up-conversion QASYMM8_SIGNED -> S16 */
                    execute_window_loop(win, [&](const Coordinates &)
                    {
                        const auto input_ptr  = reinterpret_cast<const int8_t *>(input.ptr());
                        const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());
                        int x = window_start_x;

                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
                        {
                            const int8x16_t texels_s8 = vld1q_s8(input_ptr + x);

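                            // vmovl_s8 widens each signed 8-bit half-vector to 16 bit;
                            // vshlq_s16 then applies the element-wise left shift by `b`.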
                            const int16x8x2_t texels =
                            {
                                {
                                    vshlq_s16(vmovl_s8(vget_low_s8(texels_s8)), b),
                                    vshlq_s16(vmovl_s8(vget_high_s8(texels_s8)), b)
                                }
                            };

                            vst1q_s16(output_ptr + x, texels.val[0]);
                            vst1q_s16(output_ptr + x + 8, texels.val[1]);
                        }

                        // Compute left-over elements
                        for(; x < window_end_x; ++x)
                        {
                            *(output_ptr + x) = static_cast<int16_t>(*(input_ptr + x) << _shift);
                        }
                    },
                    input, output);
                    break;
                }
                case DataType::S32:
                {
                    /* Up-conversion QASYMM8_SIGNED -> S32 */
                    execute_window_loop(win, [&](const Coordinates &)
                    {
                        const auto input_ptr  = reinterpret_cast<const int8_t *>(input.ptr());
                        const auto output_ptr = reinterpret_cast<int32_t *>(output.ptr());
                        int x = window_start_x;

                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
                        {
                            const int8x16_t texels_s8 = vld1q_s8(input_ptr + x);

                            const int16x8x2_t texels =
                            {
                                {
                                    vshlq_s16(vmovl_s8(vget_low_s8(texels_s8)), b),
                                    vshlq_s16(vmovl_s8(vget_high_s8(texels_s8)), b)
                                }
                            };

                            vst1q_s32(output_ptr + x, vmovl_s16(vget_low_s16(texels.val[0])));
                            vst1q_s32(output_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0])));
                            vst1q_s32(output_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1])));
                            vst1q_s32(output_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1])));
                        }

                        // Compute left-over elements
                        for(; x < window_end_x; ++x)
                        {
                            *(output_ptr + x) = static_cast<int32_t>(*(input_ptr + x) << _shift);
                        }
                    },
                    input, output);
                    break;
                }
                case DataType::F32:
                {
                    /* Up-conversion QASYMM8_SIGNED -> F32 */
                    execute_window_loop(win, [&](const Coordinates &)
                    {
                        const auto input_ptr  = reinterpret_cast<const int8_t *>(input.ptr());
                        const auto output_ptr = reinterpret_cast<float *>(output.ptr());

                        int x = window_start_x;
                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
                        {
                            const int8x16_t texels_s8 = vld1q_s8(input_ptr + x);

                            const int16x8x2_t texels =
                            {
                                {
                                    vshlq_s16(vmovl_s8(vget_low_s8(texels_s8)), b),
                                    vshlq_s16(vmovl_s8(vget_high_s8(texels_s8)), b)
                                }
                            };
                            vst1q_f32(output_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
                            vst1q_f32(output_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
                            vst1q_f32(output_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
                            vst1q_f32(output_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
                        }

                        // Compute left-over elements
                        for(; x < window_end_x; ++x)
                        {
                            *(output_ptr + x) = static_cast<float>(*(input_ptr + x) << _shift);
                        }
                    },
                    input, output);
                    break;
                }
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                case DataType::F16:
                {
                    /* Up-conversion QASYMM8_SIGNED -> F16 */
                    execute_window_loop(win, [&](const Coordinates &)
                    {
                        const auto input_ptr  = reinterpret_cast<const int8_t *>(input.ptr());
                        const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());
                        int x = window_start_x;

                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
                        {
                            const int8x16_t texels_s8 = vld1q_s8(input_ptr + x);

                            const int16x8x2_t texels =
                            {
                                {
                                    vshlq_s16(vmovl_s8(vget_low_s8(texels_s8)), b),
                                    vshlq_s16(vmovl_s8(vget_high_s8(texels_s8)), b)
                                }
                            };
                            vst1q_f16(output_ptr + x, vcvtq_f16_s16(texels.val[0]));
                            vst1q_f16(output_ptr + x + 8, vcvtq_f16_s16(texels.val[1]));
                        }

                        // Compute left-over elements
                        for(; x < window_end_x; ++x)
                        {
                            *(output_ptr + x) = static_cast<float16_t>(*(input_ptr + x) << _shift);
                        }
                    },
                    input, output);
                    break;
                }
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

                default:
                    ARM_COMPUTE_ERROR("Output data type not supported");
            }
            break;
        }

        case DataType::QASYMM8:
        case DataType::U8:
        {
            const int16x8_t b = vdupq_n_s16(_shift);

            switch(_output->info()->data_type())
            {
                case DataType::S16:
                {
                    /* Up-conversion U8 -> S16 */
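                    // vmovl_u8 widens each unsigned 8-bit half-vector to u16; since all
                    // values fit in [0, 255], reinterpreting as s16 is lossless before
                    // the signed shift.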
                    execute_window_loop(win, [&](const Coordinates &)
                    {
                        const auto input_ptr  = reinterpret_cast<const uint8_t *>(input.ptr());
                        const auto output_ptr = reinterpret_cast<int16_t *>(output.ptr());

                        int x = window_start_x;
                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
                        {
                            const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);

                            const int16x8x2_t texels =
                            {
                                {
                                    vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), b),
                                    vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))), b)
                                }
                            };

                            vst1q_s16(output_ptr + x, texels.val[0]);
                            vst1q_s16(output_ptr + x + 8, texels.val[1]);
                        }

                        // Compute left-over elements
                        for(; x < window_end_x; ++x)
                        {
                            auto in = static_cast<int32_t>(*(input_ptr + x));
                            *(output_ptr + x) = in << _shift;
                        }
                    },
                    input, output);
                    break;
                }
                case DataType::S32:
                {
                    /* Up-conversion U8 -> S32 */
                    execute_window_loop(win, [&](const Coordinates &)
                    {
                        const auto input_ptr  = reinterpret_cast<const uint8_t *>(input.ptr());
                        const auto output_ptr = reinterpret_cast<int32_t *>(output.ptr());

                        int x = window_start_x;
                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
                        {
                            const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);

                            const int16x8x2_t texels =
                            {
                                {
                                    vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), b),
                                    vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))), b)
                                }
                            };

                            vst1q_s32(output_ptr + x, vmovl_s16(vget_low_s16(texels.val[0])));
                            vst1q_s32(output_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0])));
                            vst1q_s32(output_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1])));
                            vst1q_s32(output_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1])));
                        }

                        // Compute left-over elements
                        for(; x < window_end_x; ++x)
                        {
                            auto in = static_cast<uint32_t>(*(input_ptr + x));
                            *(output_ptr + x) = in << _shift;
                        }
                    },
                    input, output);
                    break;
                }
                case DataType::F32:
                {
                    /* Up-conversion U8 -> F32 */
                    execute_window_loop(win, [&](const Coordinates &)
                    {
                        const auto input_ptr  = reinterpret_cast<const uint8_t *>(input.ptr());
                        const auto output_ptr = reinterpret_cast<float *>(output.ptr());

                        int x = window_start_x;
                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
                        {
                            const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);

                            const int16x8x2_t texels =
                            {
                                {
                                    vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), b),
                                    vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))), b)
                                }
                            };
                            vst1q_f32(output_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
                            vst1q_f32(output_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
                            vst1q_f32(output_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
                            vst1q_f32(output_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
                        }

                        // Compute left-over elements
                        for(; x < window_end_x; ++x)
                        {
                            auto in = static_cast<uint32_t>(*(input_ptr + x));
                            *(output_ptr + x) = static_cast<float>(in << _shift);
                        }
                    },
                    input, output);
                    break;
                }
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                case DataType::F16:
                {
                    /* Up-conversion U8 -> F16 */
                    execute_window_loop(win, [&](const Coordinates &)
                    {
                        const auto input_ptr  = reinterpret_cast<const uint8_t *>(input.ptr());
                        const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());

                        int x = window_start_x;
                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
                        {
                            const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);

                            const int16x8x2_t texels =
                            {
                                {
                                    vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), b),
                                    vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))), b)
                                }
                            };
                            vst1q_f16(output_ptr + x, vcvtq_f16_s16(texels.val[0]));
                            vst1q_f16(output_ptr + x + 8, vcvtq_f16_s16(texels.val[1]));
                        }

                        // Compute left-over elements
                        for(; x < window_end_x; ++x)
                        {
                            *(output_ptr + x) = static_cast<float16_t>(*(input_ptr + x) << _shift);
                        }
                    },
                    input, output);
                    break;
                }
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                case DataType::U16:
                {
                    /* Up-conversion U8 -> U16 */
                    execute_window_loop(win, [&](const Coordinates &)
                    {
                        const auto input_ptr  = reinterpret_cast<const uint8_t *>(input.ptr());
                        const auto output_ptr = reinterpret_cast<uint16_t *>(output.ptr());

                        int x = window_start_x;
                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
                        {
                            const uint8x16_t texels_u8 = vld1q_u8(input_ptr + x);

                            const uint16x8x2_t texels =
                            {
                                {
                                    vshlq_u16(vmovl_u8(vget_low_u8(texels_u8)), b),
                                    vshlq_u16(vmovl_u8(vget_high_u8(texels_u8)), b)
                                }
                            };

                            vst1q_u16(output_ptr + x, texels.val[0]);
                            vst1q_u16(output_ptr + x + 8, texels.val[1]);
                        }

                        // Compute left-over elements
                        for(; x < window_end_x; ++x)
                        {
                            *(output_ptr + x) = static_cast<uint16_t>(*(input_ptr + x)) << _shift;
                        }
                    },
                    input, output);
                    break;
                }
                default:
                    ARM_COMPUTE_ERROR("Output data type not supported");
            }
            break;
        }
        case DataType::S16:
        {
            switch(_output->info()->data_type())
            {
                case DataType::QASYMM8_SIGNED:
                {
                    const int16x8_t b = vdupq_n_s16(-static_cast<int16_t>(_shift));

                    /* Down-conversion S16 -> QASYMM8_SIGNED */
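                    // NEON has no vector "shift right by vector", so the shift amount is
                    // negated and vshlq/vqshlq shift right instead. SATURATE uses the
                    // saturating variants (vqshlq_s16 / vqmovn_s16); WRAP uses the plain
                    // ones, so out-of-range values wrap around.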
                    if(ConvertPolicy::SATURATE == _policy)
                    {
                        execute_window_loop(win, [&](const Coordinates &)
                        {
                            const auto input_ptr  = reinterpret_cast<const int16_t *>(input.ptr());
                            const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());

                            int x = window_start_x;
                            for(; x <= (window_end_x - window_step_x); x += window_step_x)
                            {
                                const int16x8x2_t texels =
                                {
                                    {
                                        vqshlq_s16(vld1q_s16(input_ptr + x), b),
                                        vqshlq_s16(vld1q_s16(input_ptr + x + 8), b)
                                    }
                                };

                                vst1q_s8(output_ptr + x, vcombine_s8(vqmovn_s16(texels.val[0]), vqmovn_s16(texels.val[1])));
                            }

                            // Compute left-over elements
                            for(; x < window_end_x; ++x)
                            {
                                *(output_ptr + x) = utils::cast::saturate_cast<int8_t>(*(input_ptr + x) >> _shift);
                            }
                        },
                        input, output);
                    }
                    else
                    {
                        execute_window_loop(win, [&](const Coordinates &)
                        {
                            const auto input_ptr  = reinterpret_cast<const int16_t *>(input.ptr());
                            const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());

                            int x = window_start_x;
                            for(; x <= (window_end_x - window_step_x); x += window_step_x)
                            {
                                const int16x8x2_t texels =
                                {
                                    {
                                        vshlq_s16(vld1q_s16(input_ptr + x), b),
                                        vshlq_s16(vld1q_s16(input_ptr + x + 8), b)
                                    }
                                };

                                vst1q_s8(output_ptr + x, vcombine_s8(vmovn_s16(texels.val[0]), vmovn_s16(texels.val[1])));
                            }

                            // Compute left-over elements
                            for(; x < window_end_x; ++x)
                            {
                                *(output_ptr + x) = static_cast<int8_t>(*(input_ptr + x) >> _shift);
                            }
                        },
                        input, output);
                    }
                    break;
                }
                case DataType::U8:
                {
                    const int16x8_t b = vdupq_n_s16(-static_cast<int16_t>(_shift));

                    /* Down-conversion S16 -> U8 */
                    if(ConvertPolicy::SATURATE == _policy)
                    {
                        execute_window_loop(win, [&](const Coordinates &)
                        {
                            const auto input_ptr  = reinterpret_cast<const int16_t *>(input.ptr());
                            const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());

                            int x = window_start_x;
                            for(; x <= (window_end_x - window_step_x); x += window_step_x)
                            {
                                const int16x8x2_t texels =
                                {
                                    {
                                        vqshlq_s16(vld1q_s16(input_ptr + x), b),
                                        vqshlq_s16(vld1q_s16(input_ptr + x + 8), b)
                                    }
                                };

                                vst1q_u8(output_ptr + x, vcombine_u8(vqmovun_s16(texels.val[0]), vqmovun_s16(texels.val[1])));
                            }

                            // Compute left-over elements
                            for(; x < window_end_x; ++x)
                            {
                                *(output_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(input_ptr + x) >> _shift);
                            }
                        },
                        input, output);
                    }
                    else
                    {
                        execute_window_loop(win, [&](const Coordinates &)
                        {
                            const auto input_ptr  = reinterpret_cast<const int16_t *>(input.ptr());
                            const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());

                            int x = window_start_x;
                            for(; x <= (window_end_x - window_step_x); x += window_step_x)
                            {
                                const int16x8x2_t texels =
                                {
                                    {
                                        vshlq_s16(vld1q_s16(input_ptr + x), b),
                                        vshlq_s16(vld1q_s16(input_ptr + x + 8), b)
                                    }
                                };

                                vst1q_u8(output_ptr + x, vcombine_u8(vmovn_u16(vreinterpretq_u16_s16(texels.val[0])),
                                                                     vmovn_u16(vreinterpretq_u16_s16(texels.val[1]))));
                            }

                            // Compute left-over elements
                            for(; x < window_end_x; ++x)
                            {
                                *(output_ptr + x) = static_cast<uint8_t>(*(input_ptr + x) >> _shift);
                            }
                        },
                        input, output);
                    }
                    break;
                }
                case DataType::S32:
                {
                    const int32x4_t b = vdupq_n_s32(_shift);

                    /* Up-conversion S16 -> S32 */
                    execute_window_loop(win, [&](const Coordinates &)
                    {
                        const auto input_ptr  = reinterpret_cast<const int16_t *>(input.ptr());
                        const auto output_ptr = reinterpret_cast<int32_t *>(output.ptr());

                        int x = window_start_x;
                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
                        {
                            const int16x8x2_t texels =
                            {
                                {
                                    vld1q_s16(input_ptr + x),
                                    vld1q_s16(input_ptr + x + 8)
                                }
                            };

                            const int32x4x4_t texels_s32 =
                            {
                                {
                                    vshlq_s32(vmovl_s16(vget_low_s16(texels.val[0])), b),
                                    vshlq_s32(vmovl_s16(vget_high_s16(texels.val[0])), b),
                                    vshlq_s32(vmovl_s16(vget_low_s16(texels.val[1])), b),
                                    vshlq_s32(vmovl_s16(vget_high_s16(texels.val[1])), b)
                                }
                            };

                            vst1q_s32(output_ptr + x, texels_s32.val[0]);
                            vst1q_s32(output_ptr + x + 4, texels_s32.val[1]);
                            vst1q_s32(output_ptr + x + 8, texels_s32.val[2]);
                            vst1q_s32(output_ptr + x + 12, texels_s32.val[3]);
                        }

                        // Compute left-over elements
                        for(; x < window_end_x; ++x)
                        {
                            *(output_ptr + x) = static_cast<int32_t>(*(input_ptr + x) << _shift);
                        }
                    },
                    input, output);
                    break;
                }
                default:
                    ARM_COMPUTE_ERROR("Output data type not supported");
            }
            break;
        }
        case DataType::U16:
        {
            switch(_output->info()->data_type())
            {
                case DataType::U8:
                {
                    const int16x8_t b = vdupq_n_s16(-static_cast<int16_t>(_shift));

                    /* Down-conversion U16 -> U8 */
                    if(ConvertPolicy::SATURATE == _policy)
                    {
                        execute_window_loop(win, [&](const Coordinates &)
                        {
                            const auto input_ptr  = reinterpret_cast<const uint16_t *>(input.ptr());
                            const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());

                            int x = window_start_x;
                            for(; x <= (window_end_x - window_step_x); x += window_step_x)
                            {
                                const uint16x8x2_t texels =
                                {
                                    {
                                        vqshlq_u16(vld1q_u16(input_ptr + x), b),
                                        vqshlq_u16(vld1q_u16(input_ptr + x + 8), b)
                                    }
                                };

                                vst1q_u8(output_ptr + x, vcombine_u8(vqmovn_u16(texels.val[0]), vqmovn_u16(texels.val[1])));
                            }

                            // Compute left-over elements
                            for(; x < window_end_x; ++x)
                            {
                                *(output_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(input_ptr + x) >> _shift);
                            }
                        },
                        input, output);
                    }
                    else
                    {
                        execute_window_loop(win, [&](const Coordinates &)
                        {
                            const auto input_ptr  = reinterpret_cast<const uint16_t *>(input.ptr());
                            const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());

                            int x = window_start_x;
                            for(; x <= (window_end_x - window_step_x); x += window_step_x)
                            {
                                const uint16x8x2_t texels =
                                {
                                    {
                                        vshlq_u16(vld1q_u16(input_ptr + x), b),
                                        vshlq_u16(vld1q_u16(input_ptr + x + 8), b)
                                    }
                                };

                                vst1q_u8(output_ptr + x, vcombine_u8(vmovn_u16(texels.val[0]), vmovn_u16(texels.val[1])));
                            }

                            // Compute left-over elements
                            for(; x < window_end_x; ++x)
                            {
                                *(output_ptr + x) = static_cast<uint8_t>(*(input_ptr + x) >> _shift);
                            }
                        },
                        input, output);
                    }
                    break;
                }
                case DataType::U32:
                {
                    const int32x4_t b = vdupq_n_s32(_shift);

                    /* Up-conversion U16 -> U32 */
                    execute_window_loop(win, [&](const Coordinates &)
                    {
                        const auto input_ptr  = reinterpret_cast<const uint16_t *>(input.ptr());
                        const auto output_ptr = reinterpret_cast<uint32_t *>(output.ptr());

                        int x = window_start_x;
                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
                        {
                            const uint16x8x2_t texels =
                            {
                                {
                                    vld1q_u16(input_ptr + x),
                                    vld1q_u16(input_ptr + x + 8)
                                }
                            };

                            vst1q_u32(output_ptr + x, vshlq_u32(vmovl_u16(vget_low_u16(texels.val[0])), b));
                            vst1q_u32(output_ptr + x + 4, vshlq_u32(vmovl_u16(vget_high_u16(texels.val[0])), b));
                            vst1q_u32(output_ptr + x + 8, vshlq_u32(vmovl_u16(vget_low_u16(texels.val[1])), b));
                            vst1q_u32(output_ptr + x + 12, vshlq_u32(vmovl_u16(vget_high_u16(texels.val[1])), b));
                        }
                        // Compute left-over elements
                        for(; x < window_end_x; ++x)
                        {
                            *(output_ptr + x) = static_cast<uint32_t>(*(input_ptr + x) << _shift);
                        }
                    },
                    input, output);
                    break;
                }
                default:
                    ARM_COMPUTE_ERROR("Output data type not supported");
            }
            break;
        }
#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16)
        case DataType::BFLOAT16:
            switch(_output->info()->data_type())
            {
                case DataType::F32:
                {
                    /* Up-conversion BFLOAT16 -> F32 */
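                    // A bfloat16 is the top 16 bits of an IEEE-754 binary32 value, so the
                    // widening conversion is just a 16-bit left shift of the raw bit
                    // pattern (vshlq_n_u32 below).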
                    execute_window_loop(win, [&](const Coordinates &)
                    {
                        const auto input_ptr  = reinterpret_cast<const bfloat16 *>(input.ptr());
                        const auto output_ptr = reinterpret_cast<float *>(output.ptr());

                        int x = window_start_x;
                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
                        {
                            const uint16x8x2_t texels =
                            {
                                {
                                    vld1q_u16(reinterpret_cast<const uint16_t *>(input_ptr) + x),
                                    vld1q_u16(reinterpret_cast<const uint16_t *>(input_ptr) + x + 8)
                                }
                            };

                            vst1q_f32(output_ptr + x,
                                      vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_low_u16(texels.val[0])), 16)));
                            vst1q_f32(output_ptr + x + 4,
                                      vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_high_u16(texels.val[0])), 16)));
                            vst1q_f32(output_ptr + x + 8,
                                      vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_low_u16(texels.val[1])), 16)));
                            vst1q_f32(output_ptr + x + 12,
                                      vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_high_u16(texels.val[1])), 16)));
                        }

                        // Compute left-over elements
                        for(; x < window_end_x; ++x)
                        {
                            *(output_ptr + x) = float(*(input_ptr + x));
                        }
                    },
                    input, output);
                    break;
                }
                default:
                    ARM_COMPUTE_ERROR("Output data type unsupported");
            }
            break;
#endif /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
        case DataType::F16:
            switch(_output->info()->data_type())
            {
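                // In the floating-point paths the shift is applied as a multiplication
                // by a power of two (`scale`) rather than as a bit shift.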
                case DataType::QASYMM8_SIGNED:
                {
                    const float16_t   scale_s = 1 << _shift;
                    const float16x8_t scale   = vdupq_n_f16(scale_s);

                    /* Down-conversion F16 -> QASYMM8_SIGNED (Always saturating) */
                    execute_window_loop(win, [&](const Coordinates &)
                    {
                        const auto input_ptr  = reinterpret_cast<const float16_t *>(input.ptr());
                        const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());

                        int x = window_start_x;
                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
                        {
                            const float16x8x2_t texels =
                            {
                                {
                                    vmulq_f16(vld1q_f16(input_ptr + x), scale),
                                    vmulq_f16(vld1q_f16(input_ptr + x + 8), scale),
                                }
                            };

                            vst1q_s8(output_ptr + x, vcombine_s8(vqmovn_s16(vcvtq_s16_f16(texels.val[0])), vqmovn_s16(vcvtq_s16_f16(texels.val[1]))));
                        }

                        // Compute left-over elements
                        for(; x < window_end_x; ++x)
                        {
                            *(output_ptr + x) = utils::cast::saturate_cast<int8_t>(*(input_ptr + x) * scale_s);
                        }
                    },
                    input, output);
                    break;
                }
                case DataType::QASYMM8:
                case DataType::U8:
                {
                    const float16_t   scale_s = 1 << _shift;
                    const float16x8_t scale   = vdupq_n_f16(scale_s);

                    /* Down-conversion F16 -> QASYMM8/U8 (Always saturating) */
                    execute_window_loop(win, [&](const Coordinates &)
                    {
                        const auto input_ptr  = reinterpret_cast<const float16_t *>(input.ptr());
                        const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());

                        int x = window_start_x;
                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
                        {
                            const float16x8x2_t texels =
                            {
                                {
                                    vmulq_f16(vld1q_f16(input_ptr + x), scale),
                                    vmulq_f16(vld1q_f16(input_ptr + x + 8), scale),
                                }
                            };

                            vst1q_u8(output_ptr + x, vcombine_u8(vqmovun_s16(vcvtq_s16_f16(texels.val[0])), vqmovun_s16(vcvtq_s16_f16(texels.val[1]))));
                        }

                        // Compute left-over elements
                        for(; x < window_end_x; ++x)
                        {
                            *(output_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(input_ptr + x) * scale_s);
                        }
                    },
                    input, output);
                    break;
                }
                case DataType::F32:
                {
                    const float       scale_s = 1 << _shift;
                    const float32x4_t scale   = vdupq_n_f32(scale_s);

                    /* Up-conversion F16 -> F32 */
                    execute_window_loop(win, [&](const Coordinates &)
                    {
                        const auto input_ptr  = reinterpret_cast<const float16_t *>(input.ptr());
                        const auto output_ptr = reinterpret_cast<float *>(output.ptr());

                        int x = window_start_x;
                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
                        {
                            const float16x8x2_t texels =
                            {
                                {
                                    vld1q_f16(input_ptr + x),
                                    vld1q_f16(input_ptr + x + 8)
                                }
                            };
                            vst1q_f32(output_ptr + x, vmulq_f32(vcvt_f32_f16(vget_low_f16(texels.val[0])), scale));
                            vst1q_f32(output_ptr + x + 4, vmulq_f32(vcvt_f32_f16(vget_high_f16(texels.val[0])), scale));
                            vst1q_f32(output_ptr + x + 8, vmulq_f32(vcvt_f32_f16(vget_low_f16(texels.val[1])), scale));
                            vst1q_f32(output_ptr + x + 12, vmulq_f32(vcvt_f32_f16(vget_high_f16(texels.val[1])), scale));
                        }

                        // Compute left-over elements
                        for(; x < window_end_x; ++x)
                        {
                            *(output_ptr + x) = static_cast<float>(*(input_ptr + x) * scale_s);
                        }
                    },
                    input, output);
                    break;
                }
                case DataType::S32:
                {
                    const float       scale_s = 1 << _shift;
                    const float32x4_t scale   = vdupq_n_f32(scale_s);

                    /* Up-conversion F16 -> S32 */
                    execute_window_loop(win, [&](const Coordinates &)
                    {
                        const auto input_ptr  = reinterpret_cast<const float16_t *>(input.ptr());
                        const auto output_ptr = reinterpret_cast<int32_t *>(output.ptr());

                        int x = window_start_x;
                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
                        {
                            const float16x8x2_t texels =
                            {
                                {
                                    vld1q_f16(input_ptr + x),
                                    vld1q_f16(input_ptr + x + 8)
                                }
                            };

                            vst1q_s32(output_ptr + x, vcvtq_s32_f32(vmulq_f32(vcvt_f32_f16(vget_low_f16(texels.val[0])), scale)));
                            vst1q_s32(output_ptr + x + 4, vcvtq_s32_f32(vmulq_f32(vcvt_f32_f16(vget_high_f16(texels.val[0])), scale)));
                            vst1q_s32(output_ptr + x + 8, vcvtq_s32_f32(vmulq_f32(vcvt_f32_f16(vget_low_f16(texels.val[1])), scale)));
                            vst1q_s32(output_ptr + x + 12, vcvtq_s32_f32(vmulq_f32(vcvt_f32_f16(vget_high_f16(texels.val[1])), scale)));
                        }

                        // Compute left-over elements
                        for(; x < window_end_x; ++x)
                        {
                            *(output_ptr + x) = static_cast<int32_t>(*(input_ptr + x) * scale_s);
                        }
                    },
                    input, output);
                    break;
                }
                default:
                    ARM_COMPUTE_ERROR("Output data type not supported");
            }
            break;
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
        case DataType::F32:
            switch(_output->info()->data_type())
            {
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                case DataType::F16:
                {
                    const float       scale_s = 1.f / (1 << _shift);
                    const float32x4_t scale   = vdupq_n_f32(scale_s);

                    /* Down-conversion F32 -> F16 */
                    execute_window_loop(win, [&](const Coordinates &)
                    {
                        const auto input_ptr  = reinterpret_cast<const float *>(input.ptr());
                        const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());

                        int x = window_start_x;
                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
                        {
                            const float32x4x4_t texels =
                            {
                                {
                                    vmulq_f32(vld1q_f32(input_ptr + x), scale),
                                    vmulq_f32(vld1q_f32(input_ptr + x + 4), scale),
                                    vmulq_f32(vld1q_f32(input_ptr + x + 8), scale),
                                    vmulq_f32(vld1q_f32(input_ptr + x + 12), scale)
                                }
                            };

                            vst1q_f16(output_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1])));
                            vst1q_f16(output_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3])));
                        }

                        // Compute left-over elements
                        for(; x < window_end_x; ++x)
                        {
                            *(output_ptr + x) = static_cast<float16_t>(*(input_ptr + x) * scale_s);
                        }
                    },
                    input, output);
                    break;
                }
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16)
                case DataType::BFLOAT16:
                {
                    /* Down-conversion F32 -> BFLOAT16 */
                    execute_window_loop(win, [&](const Coordinates &)
                    {
                        const auto input_ptr  = reinterpret_cast<const float *>(input.ptr());
                        const auto output_ptr = reinterpret_cast<bfloat16 *>(output.ptr());

                        int x = window_start_x;
                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
                        {
                            wrapper::vcvt_bf16_f32(input_ptr + x,
                                                   reinterpret_cast<uint16_t *>(output_ptr) + x);
                            wrapper::vcvt_bf16_f32(input_ptr + x + 8,
                                                   reinterpret_cast<uint16_t *>(output_ptr) + x + 8);
                        }

                        // Compute left-over elements
                        for(; x < window_end_x; ++x)
                        {
                            *(output_ptr + x) = *(input_ptr + x);
                        }
                    },
                    input, output);
                    break;
                }
#endif /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */
                case DataType::S32:
                {
                    const float       scale_s = 1.f / (1 << _shift);
                    const float32x4_t scale   = vdupq_n_f32(scale_s);

                    /* Conversion F32 -> S32 */
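                    // vcvtq_s32_f32 truncates towards zero, matching a C-style
                    // static_cast<int32_t> of the scaled value.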
                    execute_window_loop(win, [&](const Coordinates &)
                    {
                        const auto input_ptr  = reinterpret_cast<const float *>(input.ptr());
                        const auto output_ptr = reinterpret_cast<int32_t *>(output.ptr());

                        int x = window_start_x;
                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
                        {
                            const float32x4x4_t texels =
                            {
                                {
                                    vmulq_f32(vld1q_f32(input_ptr + x), scale),
                                    vmulq_f32(vld1q_f32(input_ptr + x + 4), scale),
                                    vmulq_f32(vld1q_f32(input_ptr + x + 8), scale),
                                    vmulq_f32(vld1q_f32(input_ptr + x + 12), scale),
                                }
                            };

                            vst1q_s32(output_ptr + x, vcvtq_s32_f32(texels.val[0]));
                            vst1q_s32(output_ptr + x + 4, vcvtq_s32_f32(texels.val[1]));
                            vst1q_s32(output_ptr + x + 8, vcvtq_s32_f32(texels.val[2]));
                            vst1q_s32(output_ptr + x + 12, vcvtq_s32_f32(texels.val[3]));
                        }

                        // Compute left-over elements
                        for(; x < window_end_x; ++x)
                        {
                            *(output_ptr + x) = static_cast<int32_t>(*(input_ptr + x) * scale_s);
                        }
                    },
                    input, output);
                    break;
                }
                case DataType::QASYMM8:
                case DataType::U8:
                {
                    const float       scale_s = 1.f / (1 << _shift);
                    const float32x4_t scale   = vdupq_n_f32(scale_s);

                    /* Down-conversion F32 -> U8 */
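                    // Conversion chain: f32 -> s32 (vcvtq_s32_f32), saturate to u16
                    // (vqmovun_s32), then saturate-narrow to u8 (vqmovn_u16).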
                    execute_window_loop(win, [&](const Coordinates &)
                    {
                        const auto input_ptr  = reinterpret_cast<const float *>(input.ptr());
                        const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());

                        int x = window_start_x;
                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
                        {
                            const float32x4x4_t texels =
                            {
                                {
                                    vmulq_f32(vld1q_f32(input_ptr + x), scale),
                                    vmulq_f32(vld1q_f32(input_ptr + x + 4), scale),
                                    vmulq_f32(vld1q_f32(input_ptr + x + 8), scale),
                                    vmulq_f32(vld1q_f32(input_ptr + x + 12), scale),
                                }
                            };

                            vst1_u8(output_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[0])), vqmovun_s32(vcvtq_s32_f32(texels.val[1])))));
                            vst1_u8(output_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[2])), vqmovun_s32(vcvtq_s32_f32(texels.val[3])))));
                        }

                        // Compute left-over elements
                        for(; x < window_end_x; ++x)
                        {
                            *(output_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(input_ptr + x) * scale_s);
                        }
                    },
                    input, output);
                    break;
                }
                case DataType::QASYMM8_SIGNED:
                {
                    const float       scale_s = 1.f / (1 << _shift);
                    const float32x4_t scale   = vdupq_n_f32(scale_s);

                    /* Down-conversion F32 -> QASYMM8_SIGNED */
                    execute_window_loop(win, [&](const Coordinates &)
                    {
                        const auto input_ptr  = reinterpret_cast<const float *>(input.ptr());
                        const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());

                        int x = window_start_x;
                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
                        {
                            const float32x4x4_t texels =
                            {
                                {
                                    vmulq_f32(vld1q_f32(input_ptr + x), scale),
                                    vmulq_f32(vld1q_f32(input_ptr + x + 4), scale),
                                    vmulq_f32(vld1q_f32(input_ptr + x + 8), scale),
                                    vmulq_f32(vld1q_f32(input_ptr + x + 12), scale),
                                }
                            };

                            vst1_s8(output_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[0])), vqmovn_s32(vcvtq_s32_f32(texels.val[1])))));
                            vst1_s8(output_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[2])), vqmovn_s32(vcvtq_s32_f32(texels.val[3])))));
                        }
                        // Compute left-over elements
                        for(; x < window_end_x; ++x)
                        {
                            *(output_ptr + x) = utils::cast::saturate_cast<int8_t>(*(input_ptr + x) * scale_s);
                        }
                    },
                    input, output);
                    break;
                }

                default:
                    ARM_COMPUTE_ERROR("Output data type not supported");
            }
            break;

        case DataType::S32:
            switch(_output->info()->data_type())
            {
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                case DataType::F16:
                {
                    const float       scale_s = 1.f / (1 << _shift);
                    const float32x4_t scale   = vdupq_n_f32(scale_s);

                    /* Down-conversion S32 -> F16 */
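                    // There is no direct s32 -> f16 conversion; values go through f32
                    // (vcvtq_f32_s32) before being narrowed with vcvt_f16_f32.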
                    execute_window_loop(win, [&](const Coordinates &)
                    {
                        const auto input_ptr  = reinterpret_cast<const int32_t *>(input.ptr());
                        const auto output_ptr = reinterpret_cast<float16_t *>(output.ptr());

                        int x = window_start_x;
                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
                        {
                            const float32x4x4_t texels =
                            {
                                {
                                    vmulq_f32(vcvtq_f32_s32(vld1q_s32(input_ptr + x)), scale),
                                    vmulq_f32(vcvtq_f32_s32(vld1q_s32(input_ptr + x + 4)), scale),
                                    vmulq_f32(vcvtq_f32_s32(vld1q_s32(input_ptr + x + 8)), scale),
                                    vmulq_f32(vcvtq_f32_s32(vld1q_s32(input_ptr + x + 12)), scale)
                                }
                            };

                            vst1q_f16(output_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1])));
                            vst1q_f16(output_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3])));
                        }

                        // Compute left-over elements
                        for(; x < window_end_x; ++x)
                        {
                            *(output_ptr + x) = static_cast<float16_t>(*(input_ptr + x) * scale_s);
                        }
                    },
                    input, output);
                    break;
                }
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
                case DataType::F32:
                {
                    const float       scale_s = 1.f / (1 << _shift);
                    const float32x4_t scale   = vdupq_n_f32(scale_s);

                    /* Conversion S32 -> F32: convert to float first, then apply the
                       1/2^shift scale in the float domain */
                    execute_window_loop(win, [&](const Coordinates &)
                    {
                        const auto input_ptr  = reinterpret_cast<const int32_t *>(input.ptr());
                        const auto output_ptr = reinterpret_cast<float *>(output.ptr());

                        int x = window_start_x;
                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
                        {
                            const float32x4x4_t texels =
                            {
                                {
                                    vmulq_f32(vcvtq_f32_s32(vld1q_s32(input_ptr + x)), scale),
                                    vmulq_f32(vcvtq_f32_s32(vld1q_s32(input_ptr + x + 4)), scale),
                                    vmulq_f32(vcvtq_f32_s32(vld1q_s32(input_ptr + x + 8)), scale),
                                    vmulq_f32(vcvtq_f32_s32(vld1q_s32(input_ptr + x + 12)), scale),
                                }
                            };

                            vst1q_f32(output_ptr + x, texels.val[0]);
                            vst1q_f32(output_ptr + x + 4, texels.val[1]);
                            vst1q_f32(output_ptr + x + 8, texels.val[2]);
                            vst1q_f32(output_ptr + x + 12, texels.val[3]);
                        }

                        // Compute left-over elements
                        for(; x < window_end_x; ++x)
                        {
                            *(output_ptr + x) = static_cast<float>(*(input_ptr + x)) * scale_s;
                        }
                    },
                    input, output);
                    break;
                }
                case DataType::QASYMM8_SIGNED:
                {
                    const int32x4_t b = vdupq_n_s32(-static_cast<int32_t>(_shift));

                    /* Down-conversion S32 -> QASYMM8_SIGNED */
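                    // Narrowing happens in two steps, s32 -> s16 -> s8; the SATURATE
                    // branch saturates at each step (vqmovn) while the WRAP branch
                    // truncates (vmovn).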
                    if(ConvertPolicy::SATURATE == _policy)
                    {
                        execute_window_loop(win, [&](const Coordinates &)
                        {
                            const auto input_ptr  = reinterpret_cast<const int32_t *>(input.ptr());
                            const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());

                            int x = window_start_x;
                            for(; x <= (window_end_x - window_step_x); x += window_step_x)
                            {
                                const int32x4x4_t texels =
                                {
                                    {
                                        vqshlq_s32(vld1q_s32(input_ptr + x), b),
                                        vqshlq_s32(vld1q_s32(input_ptr + x + 4), b),
                                        vqshlq_s32(vld1q_s32(input_ptr + x + 8), b),
                                        vqshlq_s32(vld1q_s32(input_ptr + x + 12), b)
                                    }
                                };
                                vst1_s8(output_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[0]), vqmovn_s32(texels.val[1]))));
                                vst1_s8(output_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[2]), vqmovn_s32(texels.val[3]))));
                            }

                            // Compute left-over elements
                            for(; x < window_end_x; ++x)
                            {
                                *(output_ptr + x) = utils::cast::saturate_cast<int8_t>(*(input_ptr + x) >> _shift);
                            }
                        },
                        input, output);
                    }
                    else
                    {
                        execute_window_loop(win, [&](const Coordinates &)
                        {
                            const auto input_ptr  = reinterpret_cast<const int32_t *>(input.ptr());
                            const auto output_ptr = reinterpret_cast<int8_t *>(output.ptr());

                            int x = window_start_x;
                            for(; x <= (window_end_x - window_step_x); x += window_step_x)
                            {
                                const int32x4x4_t texels =
                                {
                                    {
                                        vshlq_s32(vld1q_s32(input_ptr + x), b),
                                        vshlq_s32(vld1q_s32(input_ptr + x + 4), b),
                                        vshlq_s32(vld1q_s32(input_ptr + x + 8), b),
                                        vshlq_s32(vld1q_s32(input_ptr + x + 12), b)
                                    }
                                };

                                vst1_s8(output_ptr + x, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[0]), vmovn_s32(texels.val[1]))));
                                vst1_s8(output_ptr + x + 8, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[2]), vmovn_s32(texels.val[3]))));
                            }

                            // Compute left-over elements
                            for(; x < window_end_x; ++x)
                            {
                                *(output_ptr + x) = static_cast<int8_t>(*(input_ptr + x) >> _shift);
                            }
                        },
                        input, output);
                    }
                    break;
                }
                case DataType::QASYMM8:
                case DataType::U8:
                {
                    const int32x4_t b = vdupq_n_s32(-static_cast<int32_t>(_shift));

                    /* Down-conversion S32 -> U8 */
                    if(ConvertPolicy::SATURATE == _policy)
                    {
                        execute_window_loop(win, [&](const Coordinates &)
                        {
                            const auto input_ptr  = reinterpret_cast<const int32_t *>(input.ptr());
                            const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());

                            int x = window_start_x;
                            for(; x <= (window_end_x - window_step_x); x += window_step_x)
                            {
                                const int32x4x4_t texels =
                                {
                                    {
                                        vqshlq_s32(vld1q_s32(input_ptr + x), b),
                                        vqshlq_s32(vld1q_s32(input_ptr + x + 4), b),
                                        vqshlq_s32(vld1q_s32(input_ptr + x + 8), b),
                                        vqshlq_s32(vld1q_s32(input_ptr + x + 12), b)
                                    }
                                };
                                vst1_u8(output_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[0]), vqmovun_s32(texels.val[1]))));
                                vst1_u8(output_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[2]), vqmovun_s32(texels.val[3]))));
                            }

                            // Compute left-over elements
                            for(; x < window_end_x; ++x)
                            {
                                *(output_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(input_ptr + x) >> _shift);
                            }
                        },
                        input, output);
                    }
                    else
                    {
                        execute_window_loop(win, [&](const Coordinates &)
                        {
                            const auto input_ptr  = reinterpret_cast<const int32_t *>(input.ptr());
                            const auto output_ptr = reinterpret_cast<uint8_t *>(output.ptr());

                            int x = window_start_x;
                            for(; x <= (window_end_x - window_step_x); x += window_step_x)
                            {
                                const int32x4x4_t texels =
                                {
                                    {
                                        vshlq_s32(vld1q_s32(input_ptr + x), b),
                                        vshlq_s32(vld1q_s32(input_ptr + x + 4), b),
                                        vshlq_s32(vld1q_s32(input_ptr + x + 8), b),
                                        vshlq_s32(vld1q_s32(input_ptr + x + 12), b)
                                    }
                                };

                                vst1_u8(output_ptr + x, vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[0])), vmovn_u32(vreinterpretq_u32_s32(texels.val[1])))));
                                vst1_u8(output_ptr + x + 8, vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[2])), vmovn_u32(vreinterpretq_u32_s32(texels.val[3])))));
                            }

                            // Compute left-over elements
                            for(; x < window_end_x; ++x)
                            {
                                *(output_ptr + x) = static_cast<uint8_t>(*(input_ptr + x) >> _shift);
                            }
                        },
                        input, output);
                    }
                    break;
                }
                default:
                    ARM_COMPUTE_ERROR("Output data type not supported");
            }
            break;
        default:
            ARM_COMPUTE_ERROR("Not supported");
    }
}
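
// Worked example of the two overflow policies (sketch, shift = 0): converting
// the S16 value 300 to U8 yields 255 under ConvertPolicy::SATURATE, but
// 300 mod 256 = 44 under ConvertPolicy::WRAP.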