Compute Library
 21.11
CpuCastKernel.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2016-2021 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
25 
26 #include "arm_compute/core/Error.h"
31 #include "src/core/CPP/Validate.h"
33 #include "src/core/NEON/NEMath.h"
37 #include "support/SaturateCast.h"
38 
39 namespace arm_compute
40 {
41 namespace cpu
42 {
43 namespace kernels
44 {
45 namespace
46 {
47 Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy)
48 {
53  ARM_COMPUTE_UNUSED(policy);
54  ARM_COMPUTE_RETURN_ERROR_ON(src == dst);
61 
62  ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::QASYMM8_SIGNED && (dst->data_type() != DataType::S16 && dst->data_type() != DataType::S32
63  && dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32),
64  "Only data_types supported [in] QASYMM8 -> [out] U16, S16, S32, F16, F32");
65 
66  ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::QASYMM8 && (dst->data_type() != DataType::S16 && dst->data_type() != DataType::U16
67  && dst->data_type() != DataType::S32 && dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32),
68  "Only data_types supported [in] QASYMM8 -> [out] U16, S16, S32, F16, F32");
69 
70  ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U8 && (dst->data_type() != DataType::S16 && dst->data_type() != DataType::U16
71  && dst->data_type() != DataType::S32 && dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32),
72  "Only data_types supported [in] U8 -> [out] U16, S16, S32, F16, F32");
73 
74  ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U16 && (dst->data_type() != DataType::U8 && dst->data_type() != DataType::U32),
75  "Only data_types supported [in] U16 -> [out] U8, U32");
76 
77  ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S16 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::U8 && dst->data_type() != DataType::S32),
78  "Only data_types supported [in] S16 -> [out] U8, S32");
79 
80  ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::BFLOAT16 && dst->data_type() != DataType::F32,
81  "Only data_types supported [in] BFLOAT16 -> [out] F32");
82 
83  ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::F16 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::QASYMM8
84  && dst->data_type() != DataType::U8
85  && dst->data_type() != DataType::F32 && dst->data_type() != DataType::S32),
86  "Only data_types supported [in] F16 -> [out] QASYMM8, F32, S32, U8");
87 
88  ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::F32 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::QASYMM8
89  && dst->data_type() != DataType::F16 && dst->data_type() != DataType::BFLOAT16
90  && dst->data_type() != DataType::S32 && dst->data_type() != DataType::U8),
91  "Only data_types supported [in] F32 -> [out] QASYMM8, BFLOAT16, F16, S32, U8");
92 
93  ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S32 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::QASYMM8
94  && dst->data_type() != DataType::F16
95  && dst->data_type() != DataType::F32 && dst->data_type() != DataType::U8),
96  "Only data_types supported [in] S32 -> [out] QASYMM8, F16, F32, U8");
97 
98  // Validate in case of configured dst
99  if(dst->total_size() > 0)
100  {
102  }
103 
104  return Status{};
105 }
106 } // namespace
107 
// NOTE(review): the signature line for this definition is missing from this
// extract (it was dropped by the doxygen scrape); from the body it is
// presumably `void CpuCastKernel::configure(const ITensorInfo *src,
// ITensorInfo *dst, ConvertPolicy policy)` -- confirm against upstream.
// Configures the kernel: auto-initializes the dst shape, stores the policy,
// validates the arguments and sets the execution window.
109 {
111 
112  // Auto initialize dst shape if not initialized (We can only auto-configure the shape, datatype must be given)
113  set_shape_if_empty(*dst, src->tensor_shape());
114 
115  _policy = policy;
116 
 // Validation runs after shape auto-initialization so that an empty dst
 // shape does not spuriously fail; throws on error.
117  ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, policy));
118 
119  // Configure kernel window
120  Window win = calculate_max_window(*src, Steps());
121 
122  ICPPKernel::configure(win);
123 }
124 
// NOTE(review): the signature line for this definition is missing from this
// extract (dropped by the doxygen scrape); presumably
// `Status CpuCastKernel::validate(const ITensorInfo *src, const ITensorInfo
// *dst, ConvertPolicy policy)` -- confirm against upstream.
// Thin static-validation wrapper: forwards to validate_arguments() and
// converts its error (if any) into the returned Status.
126 {
127  ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, policy));
128  return Status{};
129 }
130 
132 {
133  ARM_COMPUTE_UNUSED(info);
136 
137  const auto window_start_x = static_cast<int>(window.x().start());
138  const auto window_end_x = static_cast<int>(window.x().end());
139  const int window_step_x = 16;
140 
141  const ITensor *_src = tensors.get_const_tensor(TensorType::ACL_SRC);
142  ITensor *_dst = tensors.get_tensor(TensorType::ACL_DST);
143  ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
144  ARM_COMPUTE_ERROR_ON(_src == _dst);
145 
146  ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
147 
148  Window win{ window };
149  win.set(Window::DimX, Window::Dimension(0, 1, 1));
150 
151  Iterator src(_src, win);
152  Iterator dst(_dst, win);
153 
154  switch(_src->info()->data_type())
155  {
157  {
158  switch(_dst->info()->data_type())
159  {
160  case DataType::S16:
161  {
162  /* Up-conversion QASYMM8_SIGNED -> S16 */
163  execute_window_loop(win, [&](const Coordinates &)
164  {
165  const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
166  const auto dst_ptr = reinterpret_cast<int16_t *>(dst.ptr());
167  int x = window_start_x;
168 
169  for(; x <= (window_end_x - window_step_x); x += window_step_x)
170  {
171  const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
172 
173  const int16x8x2_t texels =
174  {
175  {
176  vmovl_s8(vget_low_s8(texels_s8)),
177  vmovl_s8(vget_high_s8(texels_s8))
178  }
179  };
180 
181  vst1q_s16(dst_ptr + x, texels.val[0]);
182  vst1q_s16(dst_ptr + x + 8, texels.val[1]);
183  }
184 
185  // Compute left-over elements
186  for(; x < window_end_x; ++x)
187  {
188  *(dst_ptr + x) = static_cast<int16_t>(*(src_ptr + x));
189  }
190  },
191  src, dst);
192  break;
193  }
194  case DataType::S32:
195  {
196  /* Up-conversion QASYMM8_SIGNED -> S32 */
197  execute_window_loop(win, [&](const Coordinates &)
198  {
199  const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
200  const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
201  int x = window_start_x;
202 
203  for(; x <= (window_end_x - window_step_x); x += window_step_x)
204  {
205  const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
206 
207  const int16x8x2_t texels =
208  {
209  {
210  vmovl_s8(vget_low_s8(texels_s8)),
211  vmovl_s8(vget_high_s8(texels_s8))
212  }
213  };
214 
215  vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0])));
216  vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0])));
217  vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1])));
218  vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1])));
219  }
220 
221  // Compute left-over elements
222  for(; x < window_end_x; ++x)
223  {
224  *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
225  }
226  },
227  src, dst);
228  break;
229  }
230  case DataType::F32:
231  {
232  /* Up-conversion QASYMM8_SIGNED -> F32 */
233  execute_window_loop(win, [&](const Coordinates &)
234  {
235  const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
236  const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
237 
238  int x = window_start_x;
239  for(; x <= (window_end_x - window_step_x); x += window_step_x)
240  {
241  const int8x16_t texels_s8 = vld1q_s8(reinterpret_cast<int8_t *>(src.ptr()));
242 
243  const int16x8x2_t texels =
244  {
245  {
246  vmovl_s8(vget_low_s8(texels_s8)),
247  vmovl_s8(vget_high_s8(texels_s8))
248  }
249  };
250  vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
251  vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
252  vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
253  vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
254  }
255 
256  // Compute left-over elements
257  for(; x < window_end_x; ++x)
258  {
259  *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
260  }
261  },
262  src, dst);
263  break;
264  }
265 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
266  case DataType::F16:
267  {
268  /* Up-conversion QASYMM8_SIGNED -> F16 */
269  execute_window_loop(win, [&](const Coordinates &)
270  {
271  const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
272  const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr());
273  int x = window_start_x;
274 
275  for(; x <= (window_end_x - window_step_x); x += window_step_x)
276  {
277  const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
278 
279  const int16x8x2_t texels =
280  {
281  {
282  vmovl_s8(vget_low_s8(texels_s8)),
283  vmovl_s8(vget_high_s8(texels_s8))
284  }
285  };
286  vst1q_f16(dst_ptr + x, vcvtq_f16_s16(texels.val[0]));
287  vst1q_f16(dst_ptr + x + 8, vcvtq_f16_s16(texels.val[1]));
288  }
289 
290  // Compute left-over elements
291  for(; x < window_end_x; ++x)
292  {
293  *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
294  }
295  },
296  src, dst);
297  break;
298  }
299 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
300 
301  default:
302  ARM_COMPUTE_ERROR("dst data type not supported");
303  }
304  break;
305  }
306 
307  case DataType::QASYMM8:
308  case DataType::U8:
309  {
310  switch(_dst->info()->data_type())
311  {
312  case DataType::S16:
313  {
314  /* Up-conversion U8 -> S16 */
315  execute_window_loop(win, [&](const Coordinates &)
316  {
317  const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
318  const auto dst_ptr = reinterpret_cast<int16_t *>(dst.ptr());
319 
320  int x = window_start_x;
321  for(; x <= (window_end_x - window_step_x); x += window_step_x)
322  {
323  const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
324 
325  const int16x8x2_t texels =
326  {
327  {
328  vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
329  vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))
330  }
331  };
332 
333  vst1q_s16(dst_ptr + x, texels.val[0]);
334  vst1q_s16(dst_ptr + x + 8, texels.val[1]);
335  }
336 
337  // Compute left-over elements
338  for(; x < window_end_x; ++x)
339  {
340  *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
341  }
342  },
343  src, dst);
344  break;
345  }
346  case DataType::S32:
347  {
348  /* Up-conversion U8 -> S32 */
349  execute_window_loop(win, [&](const Coordinates &)
350  {
351  const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
352  const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
353 
354  int x = window_start_x;
355  for(; x <= (window_end_x - window_step_x); x += window_step_x)
356  {
357  const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
358 
359  const int16x8x2_t texels =
360  {
361  {
362  vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
363  vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))
364  }
365  };
366 
367  vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0])));
368  vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0])));
369  vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1])));
370  vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1])));
371  }
372 
373  // Compute left-over elements
374  for(; x < window_end_x; ++x)
375  {
376  *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x));
377  }
378  },
379  src, dst);
380  break;
381  }
382  case DataType::F32:
383  {
384  /* Up-conversion U8 -> F32 */
385  execute_window_loop(win, [&](const Coordinates &)
386  {
387  const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
388  const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
389 
390  int x = window_start_x;
391  for(; x <= (window_end_x - window_step_x); x += window_step_x)
392  {
393  const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
394 
395  const int16x8x2_t texels =
396  {
397  {
398  vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
399  vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))
400  }
401  };
402  vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
403  vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
404  vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
405  vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
406  }
407 
408  // Compute left-over elements
409  for(; x < window_end_x; ++x)
410  {
411  *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x));
412  }
413  },
414  src, dst);
415  break;
416  }
417 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
418  case DataType::F16:
419  {
420  /* Up-conversion U8 -> F16 */
421  execute_window_loop(win, [&](const Coordinates &)
422  {
423  const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
424  const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr());
425 
426  int x = window_start_x;
427  for(; x <= (window_end_x - window_step_x); x += window_step_x)
428  {
429  const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
430 
431  const int16x8x2_t texels =
432  {
433  {
434  vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
435  vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))
436  }
437  };
438  vst1q_f16(dst_ptr + x, vcvtq_f16_s16(texels.val[0]));
439  vst1q_f16(dst_ptr + x + 8, vcvtq_f16_s16(texels.val[1]));
440  }
441 
442  // Compute left-over elements
443  for(; x < window_end_x; ++x)
444  {
445  *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
446  }
447  },
448  src, dst);
449  break;
450  }
451 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
452  case DataType::U16:
453  {
454  /* Up-conversion U8 -> U16 */
455  execute_window_loop(win, [&](const Coordinates &)
456  {
457  const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
458  const auto dst_ptr = reinterpret_cast<uint16_t *>(dst.ptr());
459 
460  int x = window_start_x;
461  for(; x <= (window_end_x - window_step_x); x += window_step_x)
462  {
463  const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
464 
465  const uint16x8x2_t texels =
466  {
467  {
468  vmovl_u8(vget_low_u8(texels_u8)),
469  vmovl_u8(vget_high_u8(texels_u8))
470  }
471  };
472 
473  vst1q_u16(dst_ptr + x, texels.val[0]);
474  vst1q_u16(dst_ptr + x + 8, texels.val[1]);
475  }
476 
477  // Compute left-over elements
478  for(; x < window_end_x; ++x)
479  {
480  *(dst_ptr + x) = static_cast<uint16_t>(*(src_ptr + x));
481  }
482  },
483  src, dst);
484  break;
485  }
486  default:
487  ARM_COMPUTE_ERROR("dst data type not supported");
488  }
489  break;
490  }
491  case DataType::S16:
492  {
493  switch(_dst->info()->data_type())
494  {
496  {
497  /* Down-conversion S16 -> QASYMM8_SIGNED */
498  if(ConvertPolicy::SATURATE == _policy)
499  {
500  execute_window_loop(win, [&](const Coordinates &)
501  {
502  const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
503  const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
504 
505  int x = window_start_x;
506  for(; x <= (window_end_x - window_step_x); x += window_step_x)
507  {
508  const int16x8x2_t texels =
509  {
510  {
511  vld1q_s16(src_ptr + x),
512  vld1q_s16(src_ptr + x + 8)
513  }
514  };
515 
516  vst1q_s8(dst_ptr + x, vcombine_s8(vqmovn_s16(texels.val[0]), vqmovn_s16(texels.val[1])));
517  }
518 
519  // Compute left-over elements
520  for(; x < window_end_x; ++x)
521  {
522  *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
523  }
524  },
525  src, dst);
526  }
527  else
528  {
529  execute_window_loop(win, [&](const Coordinates &)
530  {
531  const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
532  const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
533 
534  int x = window_start_x;
535  for(; x <= (window_end_x - window_step_x); x += window_step_x)
536  {
537  const int16x8x2_t texels =
538  {
539  {
540  vld1q_s16(src_ptr + x),
541  vld1q_s16(src_ptr + x + 8)
542  }
543  };
544 
545  vst1q_s8(dst_ptr + x, vcombine_s8(vmovn_s16(texels.val[0]), vmovn_s16(texels.val[1])));
546  }
547 
548  // Compute left-over elements
549  for(; x < window_end_x; ++x)
550  {
551  *(dst_ptr + x) = static_cast<int8_t>(*(src_ptr + x));
552  }
553  },
554  src, dst);
555  }
556  break;
557  }
558  case DataType::U8:
559  {
560  /* Down-conversion S16 -> U8 */
561  if(ConvertPolicy::SATURATE == _policy)
562  {
563  execute_window_loop(win, [&](const Coordinates &)
564  {
565  const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
566  const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
567 
568  int x = window_start_x;
569  for(; x <= (window_end_x - window_step_x); x += window_step_x)
570  {
571  const int16x8x2_t texels =
572  {
573  {
574  vld1q_s16(src_ptr + x),
575  vld1q_s16(src_ptr + x + 8)
576  }
577  };
578 
579  vst1q_u8(dst_ptr + x, vcombine_u8(vqmovun_s16(texels.val[0]), vqmovun_s16(texels.val[1])));
580  }
581 
582  // Compute left-over elements
583  for(; x < window_end_x; ++x)
584  {
585  *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
586  }
587  },
588  src, dst);
589  }
590  else
591  {
592  execute_window_loop(win, [&](const Coordinates &)
593  {
594  const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
595  const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
596 
597  int x = window_start_x;
598  for(; x <= (window_end_x - window_step_x); x += window_step_x)
599  {
600  const int16x8x2_t texels =
601  {
602  {
603  vld1q_s16(src_ptr + x),
604  vld1q_s16(src_ptr + x + 8)
605  }
606  };
607 
608  vst1q_u8(dst_ptr + x, vcombine_u8(vmovn_u16(vreinterpretq_u16_s16(texels.val[0])),
609  vmovn_u16(vreinterpretq_u16_s16(texels.val[1]))));
610  }
611 
612  // Compute left-over elements
613  for(; x < window_end_x; ++x)
614  {
615  *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x));
616  }
617  },
618  src, dst);
619  }
620  break;
621  }
622  case DataType::S32:
623  {
624  /* Up-conversion S16 -> S32 */
625  execute_window_loop(win, [&](const Coordinates &)
626  {
627  const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
628  const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
629 
630  int x = window_start_x;
631  for(; x <= (window_end_x - window_step_x); x += window_step_x)
632  {
633  const int16x8x2_t texels =
634  {
635  {
636  vld1q_s16(src_ptr + x),
637  vld1q_s16(src_ptr + x + 8)
638  }
639  };
640 
641  const int32x4x4_t texels_s32 =
642  {
643  {
644  vmovl_s16(vget_low_s16(texels.val[0])),
645  vmovl_s16(vget_high_s16(texels.val[0])),
646  vmovl_s16(vget_low_s16(texels.val[1])),
647  vmovl_s16(vget_high_s16(texels.val[1]))
648  }
649  };
650 
651  vst1q_s32(dst_ptr + x, texels_s32.val[0]);
652  vst1q_s32(dst_ptr + x + 4, texels_s32.val[1]);
653  vst1q_s32(dst_ptr + x + 8, texels_s32.val[2]);
654  vst1q_s32(dst_ptr + x + 12, texels_s32.val[3]);
655  }
656 
657  // Compute left-over elements
658  for(; x < window_end_x; ++x)
659  {
660  *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
661  }
662  },
663  src, dst);
664  break;
665  }
666  default:
667  ARM_COMPUTE_ERROR("dst data type not supported");
668  }
669  break;
670  }
671  case DataType::U16:
672  {
673  switch(_dst->info()->data_type())
674  {
675  case DataType::U8:
676  {
677  /* Down-conversion U16 -> U8 */
678  if(ConvertPolicy::SATURATE == _policy)
679  {
680  execute_window_loop(win, [&](const Coordinates &)
681  {
682  const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr());
683  const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
684 
685  int x = window_start_x;
686  for(; x <= (window_end_x - window_step_x); x += window_step_x)
687  {
688  const uint16x8x2_t texels =
689  {
690  {
691  vld1q_u16(src_ptr + x),
692  vld1q_u16(src_ptr + x + 8)
693  }
694  };
695 
696  vst1q_u8(dst_ptr + x, vcombine_u8(vqmovn_u16(texels.val[0]), vqmovn_u16(texels.val[1])));
697  }
698 
699  // Compute left-over elements
700  for(; x < window_end_x; ++x)
701  {
702  *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
703  }
704  },
705  src, dst);
706  }
707  else
708  {
709  execute_window_loop(win, [&](const Coordinates &)
710  {
711  const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr());
712  const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
713 
714  int x = window_start_x;
715  for(; x <= (window_end_x - window_step_x); x += window_step_x)
716  {
717  const uint16x8x2_t texels =
718  {
719  {
720  vld1q_u16(src_ptr + x),
721  vld1q_u16(src_ptr + x + 8)
722  }
723  };
724 
725  vst1q_u8(dst_ptr + x, vcombine_u8(vmovn_u16(texels.val[0]), vmovn_u16(texels.val[1])));
726  }
727 
728  // Compute left-over elements
729  for(; x < window_end_x; ++x)
730  {
731  *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x));
732  }
733 
734  },
735  src, dst);
736  }
737  break;
738  }
739  case DataType::U32:
740  {
741  /* Up-conversion U16 -> U32 */
742  execute_window_loop(win, [&](const Coordinates &)
743  {
744  const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr());
745  const auto dst_ptr = reinterpret_cast<uint32_t *>(dst.ptr());
746 
747  int x = window_start_x;
748  for(; x <= (window_end_x - window_step_x); x += window_step_x)
749  {
750  const uint16x8x2_t texels =
751  {
752  {
753  vld1q_u16(src_ptr + x),
754  vld1q_u16(src_ptr + x + 8)
755  }
756  };
757 
758  vst1q_u32(dst_ptr + x, vmovl_u16(vget_low_u16(texels.val[0])));
759  vst1q_u32(dst_ptr + x + 4, vmovl_u16(vget_high_u16(texels.val[0])));
760  vst1q_u32(dst_ptr + x + 8, vmovl_u16(vget_low_u16(texels.val[1])));
761  vst1q_u32(dst_ptr + x + 12, vmovl_u16(vget_high_u16(texels.val[1])));
762  }
763  // Compute left-over elements
764  for(; x < window_end_x; ++x)
765  {
766  *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x));
767  }
768 
769  },
770  src, dst);
771  break;
772  }
773  default:
774  ARM_COMPUTE_ERROR("dst data type not supported");
775  }
776  break;
777  }
778 #if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16)
779  case DataType::BFLOAT16:
780  switch(_dst->info()->data_type())
781  {
782  case DataType::F32:
783  {
784  /* Up-conversion BFLOAT16 -> F32 */
785  execute_window_loop(win, [&](const Coordinates &)
786  {
787  const auto src_ptr = reinterpret_cast<const bfloat16 *>(src.ptr());
788  const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
789 
790  int x = window_start_x;
791  for(; x <= (window_end_x - window_step_x); x += window_step_x)
792  {
793  const uint16x8x2_t texels =
794  {
795  {
796  vld1q_u16(reinterpret_cast<uint16_t *>(src.ptr())),
797  vld1q_u16(reinterpret_cast<uint16_t *>(src.ptr()) + 8)
798  }
799  };
800 
801  vst1q_f32(reinterpret_cast<float *>(dst.ptr()),
802  vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_low_u16(texels.val[0])), 16)));
803  vst1q_f32(reinterpret_cast<float *>(dst.ptr()) + 4,
804  vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_high_u16(texels.val[0])), 16)));
805  vst1q_f32(reinterpret_cast<float *>(dst.ptr()) + 8,
806  vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_low_u16(texels.val[1])), 16)));
807  vst1q_f32(reinterpret_cast<float *>(dst.ptr()) + 12,
808  vreinterpretq_f32_u32(vshlq_n_u32(vmovl_u16(vget_high_u16(texels.val[1])), 16)));
809  }
810 
811  for(; x < window_end_x; ++x)
812  {
813  *(dst_ptr + x) = float(*(src_ptr + x));
814  }
815  },
816  src, dst);
817  break;
818  }
819  default:
820  ARM_COMPUTE_ERROR("dst data type unsupported");
821  }
822  break;
823 #endif /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */
824 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
825  case DataType::F16:
826  switch(_dst->info()->data_type())
827  {
829  {
830  /* Down-conversion F16 -> QASYMM8_SIGNED (Always saturating) */
831  execute_window_loop(win, [&](const Coordinates &)
832  {
833  const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr());
834  const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
835 
836  int x = window_start_x;
837  for(; x <= (window_end_x - window_step_x); x += window_step_x)
838  {
839  const float16x8x2_t texels =
840  {
841  {
842  vld1q_f16(src_ptr + x),
843  vld1q_f16(src_ptr + x + 8),
844  }
845  };
846 
847  vst1q_s8(dst_ptr + x, vcombine_s8(vqmovn_s16(vcvtq_s16_f16(texels.val[0])), vqmovn_s16(vcvtq_s16_f16(texels.val[1]))));
848  }
849 
850  // Compute left-over elements
851  for(; x < window_end_x; ++x)
852  {
853  *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
854  }
855  },
856  src, dst);
857  break;
858  }
859  case DataType::QASYMM8:
860  case DataType::U8:
861  {
862  /* Down-conversion F16 -> QASYMM8/U8 (Always saturating) */
863  execute_window_loop(win, [&](const Coordinates &)
864  {
865  const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr());
866  const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
867 
868  int x = window_start_x;
869  for(; x <= (window_end_x - window_step_x); x += window_step_x)
870  {
871  const float16x8x2_t texels =
872  {
873  {
874  vld1q_f16(src_ptr + x),
875  vld1q_f16(src_ptr + x + 8),
876  }
877  };
878 
879  vst1q_u8(dst_ptr + x, vcombine_u8(vqmovun_s16(vcvtq_s16_f16(texels.val[0])), vqmovun_s16(vcvtq_s16_f16(texels.val[1]))));
880  }
881 
882  // Compute left-over elements
883  for(; x < window_end_x; ++x)
884  {
885  *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
886  }
887 
888  },
889  src, dst);
890  break;
891  }
892  case DataType::F32:
893  {
894  /* Up-conversion F16 -> F32 */
895  execute_window_loop(win, [&](const Coordinates &)
896  {
897  const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr());
898  const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
899 
900  int x = window_start_x;
901  for(; x <= (window_end_x - window_step_x); x += window_step_x)
902  {
903  const float16x8x2_t texels =
904  {
905  {
906  vld1q_f16(src_ptr + x),
907  vld1q_f16(src_ptr + x + 8)
908  }
909  };
910  vst1q_f32(dst_ptr + x, vcvt_f32_f16(vget_low_f16(texels.val[0])));
911  vst1q_f32(dst_ptr + x + 4, vcvt_f32_f16(vget_high_f16(texels.val[0])));
912  vst1q_f32(dst_ptr + x + 8, vcvt_f32_f16(vget_low_f16(texels.val[1])));
913  vst1q_f32(dst_ptr + x + 12, vcvt_f32_f16(vget_high_f16(texels.val[1])));
914  }
915 
916  // Compute left-over elements
917  for(; x < window_end_x; ++x)
918  {
919  *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
920  }
921  },
922  src, dst);
923  break;
924  }
925  case DataType::S32:
926  {
927  /* Up-conversion F16 -> S32 */
928  execute_window_loop(win, [&](const Coordinates &)
929  {
930  const auto src_ptr = reinterpret_cast<const float16_t *>(src.ptr());
931  const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
932 
933  int x = window_start_x;
934  for(; x <= (window_end_x - window_step_x); x += window_step_x)
935  {
936  const float16x8x2_t texels =
937  {
938  {
939  vld1q_f16(src_ptr + x),
940  vld1q_f16(src_ptr + x + 8)
941  }
942  };
943 
944  vst1q_s32(dst_ptr + x, vcvtq_s32_f32(vcvt_f32_f16(vget_low_f16(texels.val[0]))));
945  vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(vcvt_f32_f16(vget_high_f16(texels.val[0]))));
946  vst1q_s32(dst_ptr + x + 8, vcvtq_s32_f32(vcvt_f32_f16(vget_low_f16(texels.val[1]))));
947  vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(vcvt_f32_f16(vget_high_f16(texels.val[1]))));
948  }
949 
950  // Compute left-over elements
951  for(; x < window_end_x; ++x)
952  {
953  *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
954  }
955  },
956  src, dst);
957  break;
958  }
959  default:
960  ARM_COMPUTE_ERROR("dst data type not supported");
961  }
962  break;
963 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
964  case DataType::F32:
965  switch(_dst->info()->data_type())
966  {
967 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
968  case DataType::F16:
969  {
970  /* Down-conversion F32 -> F16 */
971  execute_window_loop(win, [&](const Coordinates &)
972  {
973  const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
974  const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr());
975 
976  int x = window_start_x;
977  for(; x <= (window_end_x - window_step_x); x += window_step_x)
978  {
979  const float32x4x4_t texels =
980  {
981  {
982  vld1q_f32(src_ptr + x),
983  vld1q_f32(src_ptr + x + 4),
984  vld1q_f32(src_ptr + x + 8),
985  vld1q_f32(src_ptr + x + 12)
986  }
987  };
988 
989  vst1q_f16(dst_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1])));
990  vst1q_f16(dst_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3])));
991  }
992 
993  // Compute left-over elements
994  for(; x < window_end_x; ++x)
995  {
996  *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
997  }
998  },
999  src, dst);
1000  break;
1001  }
1002 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
1003 #if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16)
1004  case DataType::BFLOAT16:
1005  {
1006  /* Down-conversion F32 -> BFLOAT16 */
1007  execute_window_loop(win, [&](const Coordinates &)
1008  {
1009  const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
1010  const auto dst_ptr = reinterpret_cast<bfloat16 *>(dst.ptr());
1011 
1012  int x = window_start_x;
1013  for(; x <= (window_end_x - window_step_x); x += window_step_x)
1014  {
1015  wrapper::vcvt_bf16_f32(reinterpret_cast<float *>(src.ptr()),
1016  reinterpret_cast<uint16_t *>(dst.ptr()));
1017  wrapper::vcvt_bf16_f32(reinterpret_cast<float *>(src.ptr()) + 8,
1018  reinterpret_cast<uint16_t *>(dst.ptr()) + 8);
1019  }
1020 
1021  for(; x < window_end_x; ++x)
1022  {
1023  *(dst_ptr + x) = *(src_ptr + x);
1024  }
1025  },
1026  src, dst);
1027  break;
1028  }
1029 #endif /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */
1030  case DataType::S32:
1031  {
1032  /* Conversion F32 -> S32 */
1033  execute_window_loop(win, [&](const Coordinates &)
1034  {
1035  const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
1036  const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
1037 
1038  int x = window_start_x;
1039  for(; x <= (window_end_x - window_step_x); x += window_step_x)
1040  {
1041  const float32x4x4_t texels =
1042  {
1043  {
1044  vld1q_f32(src_ptr + x),
1045  vld1q_f32(src_ptr + x + 4),
1046  vld1q_f32(src_ptr + x + 8),
1047  vld1q_f32(src_ptr + x + 12),
1048  }
1049  };
1050 
1051  vst1q_s32(dst_ptr + x, vcvtq_s32_f32(texels.val[0]));
1052  vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(texels.val[1]));
1053  vst1q_s32(dst_ptr + x + 8, vcvtq_s32_f32(texels.val[2]));
1054  vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(texels.val[3]));
1055  }
1056 
1057  // Compute left-over elements
1058  for(; x < window_end_x; ++x)
1059  {
1060  *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
1061  }
1062  },
1063  src, dst);
1064  break;
1065  }
                case DataType::QASYMM8:
                case DataType::U8:
                {
                    /* Down-conversion F32 -> U8.
                     * Converts to S32 first (truncating vcvtq_s32_f32), then performs a
                     * saturating narrow S32 -> U16 (vqmovun) -> U8 (vqmovn): this path
                     * always saturates, irrespective of the convert policy. */
                    execute_window_loop(win, [&](const Coordinates &)
                    {
                        const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
                        const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());

                        int x = window_start_x;
                        // Vector path: 16 elements per iteration.
                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
                        {
                            const float32x4x4_t texels =
                            {
                                {
                                    vld1q_f32(src_ptr + x),
                                    vld1q_f32(src_ptr + x + 4),
                                    vld1q_f32(src_ptr + x + 8),
                                    vld1q_f32(src_ptr + x + 12),
                                }
                            };

                            vst1_u8(dst_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[0])), vqmovun_s32(vcvtq_s32_f32(texels.val[1])))));
                            vst1_u8(dst_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[2])), vqmovun_s32(vcvtq_s32_f32(texels.val[3])))));
                        }

                        // Compute left-over elements (scalar saturating cast keeps
                        // behaviour consistent with the vector path).
                        for(; x < window_end_x; ++x)
                        {
                            *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
                        }
                    },
                    src, dst);
                    break;
                }
1102  {
1103  /* Down-conversion F32 -> QASYMM8_SIGNED */
1104  execute_window_loop(win, [&](const Coordinates &)
1105  {
1106  const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
1107  const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
1108 
1109  int x = window_start_x;
1110  for(; x <= (window_end_x - window_step_x); x += window_step_x)
1111  {
1112  const float32x4x4_t texels =
1113  {
1114  {
1115  vld1q_f32(src_ptr + x),
1116  vld1q_f32(src_ptr + x + 4),
1117  vld1q_f32(src_ptr + x + 8),
1118  vld1q_f32(src_ptr + x + 12),
1119  }
1120  };
1121 
1122  vst1_s8(dst_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[0])), vqmovn_s32(vcvtq_s32_f32(texels.val[1])))));
1123  vst1_s8(dst_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[2])), vqmovn_s32(vcvtq_s32_f32(texels.val[3])))));
1124  }
1125  // Compute left-over elements
1126  for(; x < window_end_x; ++x)
1127  {
1128  *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
1129  }
1130  },
1131  src, dst);
1132  break;
1133  }
1134 
1135  default:
1136  ARM_COMPUTE_ERROR("dst data type not supported");
1137  }
1138  break;
1139 
1140  case DataType::S32:
1141  switch(_dst->info()->data_type())
1142  {
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
                case DataType::F16:
                {
                    /* Down-conversion S32 -> F16.
                     * Widens S32 to F32 (vcvtq_f32_s32), then narrows F32 -> F16.
                     * F16 cannot represent all S32 values exactly, so large magnitudes
                     * lose precision. Only compiled when FP16 vector arithmetic is
                     * available on the target. */
                    execute_window_loop(win, [&](const Coordinates &)
                    {
                        const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
                        const auto dst_ptr = reinterpret_cast<float16_t *>(dst.ptr());

                        int x = window_start_x;
                        // Vector path: 16 elements per iteration.
                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
                        {
                            const float32x4x4_t texels =
                            {
                                {
                                    vcvtq_f32_s32(vld1q_s32(src_ptr + x)),
                                    vcvtq_f32_s32(vld1q_s32(src_ptr + x + 4)),
                                    vcvtq_f32_s32(vld1q_s32(src_ptr + x + 8)),
                                    vcvtq_f32_s32(vld1q_s32(src_ptr + x + 12))
                                }
                            };

                            vst1q_f16(dst_ptr + x, vcombine_f16(vcvt_f16_f32(texels.val[0]), vcvt_f16_f32(texels.val[1])));
                            vst1q_f16(dst_ptr + x + 8, vcombine_f16(vcvt_f16_f32(texels.val[2]), vcvt_f16_f32(texels.val[3])));
                        }

                        // Compute left-over elements
                        for(; x < window_end_x; ++x)
                        {
                            *(dst_ptr + x) = static_cast<float16_t>(*(src_ptr + x));
                        }
                    },
                    src, dst);
                    break;
                }
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
                case DataType::F32:
                {
                    /* Conversion S32 -> F32 (vcvtq_f32_s32; exact for |value| <= 2^24,
                     * rounded above that). */
                    execute_window_loop(win, [&](const Coordinates &)
                    {
                        const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
                        const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());

                        int x = window_start_x;
                        // Vector path: 16 elements per iteration (4 quad registers).
                        for(; x <= (window_end_x - window_step_x); x += window_step_x)
                        {
                            const int32x4x4_t texels =
                            {
                                {
                                    vld1q_s32(src_ptr + x),
                                    vld1q_s32(src_ptr + x + 4),
                                    vld1q_s32(src_ptr + x + 8),
                                    vld1q_s32(src_ptr + x + 12),
                                }
                            };

                            vst1q_f32(dst_ptr + x, vcvtq_f32_s32(texels.val[0]));
                            vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(texels.val[1]));
                            vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(texels.val[2]));
                            vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(texels.val[3]));
                        }

                        // Compute left-over elements
                        for(; x < window_end_x; ++x)
                        {
                            *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
                        }
                    },
                    src, dst);
                    break;
                }
1216  {
1217  /* Down-conversion S32 -> QASYMM8_SIGNED */
1218  if(ConvertPolicy::SATURATE == _policy)
1219  {
1220  execute_window_loop(win, [&](const Coordinates &)
1221  {
1222  const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
1223  const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
1224 
1225  int x = window_start_x;
1226  for(; x <= (window_end_x - window_step_x); x += window_step_x)
1227  {
1228  const int32x4x4_t texels =
1229  {
1230  {
1231  vld1q_s32(src_ptr + x),
1232  vld1q_s32(src_ptr + x + 4),
1233  vld1q_s32(src_ptr + x + 8),
1234  vld1q_s32(src_ptr + x + 12),
1235  }
1236  };
1237  vst1_s8(dst_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[0]), vqmovn_s32(texels.val[1]))));
1238  vst1_s8(dst_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[2]), vqmovn_s32(texels.val[3]))));
1239  }
1240 
1241  // Compute left-over elements
1242  for(; x < window_end_x; ++x)
1243  {
1244  *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
1245  }
1246  },
1247  src, dst);
1248  }
1249  else
1250  {
1251  execute_window_loop(win, [&](const Coordinates &)
1252  {
1253  const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
1254  const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
1255 
1256  int x = window_start_x;
1257  for(; x <= (window_end_x - window_step_x); x += window_step_x)
1258  {
1259  const int32x4x4_t texels =
1260  {
1261  {
1262  vld1q_s32(src_ptr + x),
1263  vld1q_s32(src_ptr + x + 4),
1264  vld1q_s32(src_ptr + x + 8),
1265  vld1q_s32(src_ptr + x + 12)
1266  }
1267  };
1268 
1269  vst1_s8(dst_ptr + x, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[0]), vmovn_s32(texels.val[1]))));
1270  vst1_s8(dst_ptr + x + 8, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[2]), vmovn_s32(texels.val[3]))));
1271  }
1272 
1273  // Compute left-over elements
1274  for(; x < window_end_x; ++x)
1275  {
1276  *(dst_ptr + x) = static_cast<int8_t>(*(src_ptr + x));
1277  }
1278  },
1279  src, dst);
1280  }
1281  break;
1282  }
                case DataType::QASYMM8:
                case DataType::U8:
                {
                    /* Down-conversion S32 -> U8.
                     * SATURATE policy narrows with vqmovun/vqmovn (clamps to [0, 255]);
                     * otherwise the S32 bits are reinterpreted as U32 and truncated
                     * with vmovn, i.e. the value wraps modulo 256. */
                    if(ConvertPolicy::SATURATE == _policy)
                    {
                        execute_window_loop(win, [&](const Coordinates &)
                        {
                            const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
                            const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());

                            int x = window_start_x;
                            // Vector path: 16 elements per iteration, saturating narrow.
                            for(; x <= (window_end_x - window_step_x); x += window_step_x)
                            {
                                const int32x4x4_t texels =
                                {
                                    {
                                        vld1q_s32(src_ptr + x),
                                        vld1q_s32(src_ptr + x + 4),
                                        vld1q_s32(src_ptr + x + 8),
                                        vld1q_s32(src_ptr + x + 12)
                                    }
                                };
                                vst1_u8(dst_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[0]), vqmovun_s32(texels.val[1]))));
                                vst1_u8(dst_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[2]), vqmovun_s32(texels.val[3]))));
                            }

                            // Compute left-over elements
                            for(; x < window_end_x; ++x)
                            {
                                *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
                            }
                        },
                        src, dst);
                    }
                    else
                    {
                        execute_window_loop(win, [&](const Coordinates &)
                        {
                            const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
                            const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());

                            int x = window_start_x;
                            // Vector path: bit-reinterpret then plain (wrapping) narrow.
                            for(; x <= (window_end_x - window_step_x); x += window_step_x)
                            {
                                const int32x4x4_t texels =
                                {
                                    {
                                        vld1q_s32(src_ptr + x),
                                        vld1q_s32(src_ptr + x + 4),
                                        vld1q_s32(src_ptr + x + 8),
                                        vld1q_s32(src_ptr + x + 12)
                                    }
                                };

                                vst1_u8(dst_ptr + x, vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[0])), vmovn_u32(vreinterpretq_u32_s32(texels.val[1])))));
                                vst1_u8(dst_ptr + x + 8, vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[2])), vmovn_u32(vreinterpretq_u32_s32(texels.val[3])))));
                            }

                            // Compute left-over elements
                            for(; x < window_end_x; ++x)
                            {
                                *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x));
                            }
                        },
                        src, dst);
                    }
                    break;
                }
1352  default:
1353  ARM_COMPUTE_ERROR("dst data type not supported");
1354  }
1355  break;
1356  default:
1357  ARM_COMPUTE_ERROR("Not supported");
1358  }
1359 }
1360 
1361 const char *CpuCastKernel::name() const
1362 {
1363  return "CpuCastKernel.cpp";
1364 }
1365 } // namespace kernels
1366 } // namespace cpu
1367 } // namespace arm_compute
void configure(const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy)
Set the src and dst of the kernel.
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
uint16x8_t vcvtq_f16_s16(float16x8_t)
Definition: clang-tidy.h:118
const Window & window() const
The maximum window the kernel can be executed on.
Definition: IKernel.cpp:28
#define ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(tensor)
Definition: Validate.h:115
#define ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(tensor)
Definition: Validate.h:121
Brain floating point representation class.
Definition: Bfloat16.h:81
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
Definition: Error.h:352
1 channel, 1 U8 per channel
#define ARM_COMPUTE_RETURN_ON_ERROR(status)
Checks if a status contains an error and returns it.
Definition: Error.h:204
virtual DataType data_type() const =0
Data type used for each element of the tensor.
1 channel, 1 F32 per channel
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
Definition: Error.h:466
Store the tensor's metadata.
Definition: ITensorInfo.h:40
#define ARM_COMPUTE_ERROR_THROW_ON(status)
Definition: Error.h:455
Describe one of the image's dimensions with a start, end and step.
Definition: Window.h:77
1 channel, 1 U16 per channel
Status class.
Definition: Error.h:52
#define ARM_COMPUTE_RETURN_ERROR_ON(cond)
If the condition is true, an error is returned.
Definition: Error.h:296
Interface for CPU tensor.
Definition: ITensor.h:36
SimpleTensor< float > src
Definition: DFT.cpp:155
Copyright (c) 2017-2021 Arm Limited.
1 channel, 1 F16 per channel
1 channel, 1 S32 per channel
16-bit brain floating-point number
const ITensor * get_const_tensor(int id) const
Get constant tensor of a given id.
Definition: ITensorPack.cpp:54
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
Definition: Window.h:43
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
Definition: Error.h:152
1 channel, 1 U32 per channel
virtual const TensorShape & tensor_shape() const =0
Size for each dimension of the tensor.
void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
quantized, asymmetric fixed-point 8-bit number unsigned
Class to describe a number of elements in each dimension.
Definition: Steps.h:40
Coordinates of an item.
Definition: Coordinates.h:37
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
bool set_shape_if_empty(ITensorInfo &info, const TensorShape &shape)
Set the shape to the specified value if the current assignment is empty.
constexpr uint8_t * ptr() const
Return a pointer to the current pixel.
Definition: Helpers.inl:139
void set(size_t dimension, const Dimension &dim)
Set the values of a given dimension.
Definition: Window.inl:49
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
Definition: Validate.h:915
1 channel, 1 S16 per channel
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
ITensor * get_tensor(int id)
Get tensor of a given id from the pack.
Definition: ITensorPack.cpp:64
Information about executing thread and CPU.
Definition: CPPTypes.h:158
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(...)
Definition: Validate.h:439
static Status validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy)
Static function to check if given info will lead to a valid configuration.
const char * name() const override
Name of the kernel.
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
Definition: Validate.h:788
#define ARM_COMPUTE_RETURN_ERROR_ON_MSG(cond, msg)
If the condition is true, an error is returned.
Definition: Error.h:244
Tensor packing service.
Definition: ITensorPack.h:39
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
Definition: Validate.h:157
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_funct...
Definition: Helpers.inl:77
int16x8_t vcvtq_s16_f16(float16x8_t)
Definition: clang-tidy.h:63
quantized, asymmetric fixed-point 8-bit number signed
Includes all wrapper headers at once.
constexpr int end() const
Return the end of the dimension.
Definition: Window.h:99
Iterator updated by execute_window_loop for each window element.
Definition: Helpers.h:46
constexpr int start() const
Return the start of the dimension.
Definition: Window.h:94
Describe a multidimensional execution window.
Definition: Window.h:39
ConvertPolicy
Policy to handle integer overflow.
Definition: Types.h:391
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)
Definition: Validate.h:201
constexpr const Dimension & x() const
Alias to access the first dimension of the window.
Definition: Window.h:145