Compute Library
 22.11
CpuCastKernel.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2016-2022 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
25 
26 #include "arm_compute/core/Error.h"
31 #include "src/core/CPP/Validate.h"
33 #include "src/core/NEON/NEMath.h"
38 #include "support/SaturateCast.h"
39 
41 
42 namespace arm_compute
43 {
44 namespace cpu
45 {
46 namespace kernels
47 {
48 namespace
49 {
// Table of micro-kernel descriptors used for the conversions that this file does not
// implement inline. Every entry visible here carries a name string and a selector lambda
// that tests the source/destination data types together with an ISA flag (isa.fp16 or isa.bf16).
// NOTE(review): the listing skips one numbered line after each selector (55, 60, 65, ...);
// presumably that line holds the micro-kernel function pointer -- confirm against the real file.
50 static const std::vector<CpuCastKernel::CastKernel> available_kernels =
51 {
52  {
53  "neon_qs8_cast",
 // QASYMM8_SIGNED -> F16, only when isa.fp16 is set
54  [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::QASYMM8_SIGNED && data.dst_dt == DataType::F16 && data.isa.fp16; },
56  },
57  {
58  "neon_qu8_cast",
 // QASYMM8 -> F16, only when isa.fp16 is set
59  [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::QASYMM8 && data.dst_dt == DataType::F16 && data.isa.fp16; },
61  },
62  {
63  "neon_u8_cast",
 // U8 -> F16, only when isa.fp16 is set
64  [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::U8 && data.dst_dt == DataType::F16 && data.isa.fp16; },
66  },
67  {
68  "neon_fp16_cast",
 // F16 source: this selector does not inspect dst_dt, so it matches every destination type
69  [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::F16 && data.isa.fp16; },
71  },
72  {
73  "neon_fp32_to_fp16_cast",
 // F32 -> F16, only when isa.fp16 is set
74  [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::F32 && data.dst_dt == DataType::F16 && data.isa.fp16; },
76  },
77  {
78  "neon_fp32_to_bf16_cast",
 // F32 -> BFLOAT16, only when isa.bf16 is set
79  [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::F32 && data.dst_dt == DataType::BFLOAT16 && data.isa.bf16; },
81  },
82  {
83  "neon_s32_cast",
 // S32 -> F16, only when isa.fp16 is set
84  [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::S32 && data.dst_dt == DataType::F16 && data.isa.fp16; },
86  },
87  {
88  "neon_bf16_cast",
 // BFLOAT16 -> F32, only when isa.bf16 is set
89  [](const CastDataTypeISASelectorData & data) { return data.src_dt == DataType::BFLOAT16 && data.dst_dt == DataType::F32 && data.isa.bf16; },
91  },
92 };
93 
// Checks that the (src, dst) data-type pair is one of the conversions this kernel accepts.
//
// src    Metadata of the source tensor.
// dst    Metadata of the destination tensor; must be a different object from src.
// policy Conversion policy; not inspected by the checks visible here.
//
// Returns a default-constructed Status when no check fires; each ARM_COMPUTE_RETURN_ERROR_ON*
// macro is expected to return early with an error otherwise.
// The diagnostic strings below were corrected so that each one lists exactly the destination
// types its condition accepts (QASYMM8_SIGNED was mislabelled or missing in several of them).
// NOTE(review): the listing omits numbered lines 96-99, 102-107 and 148; further checks
// presumably live there -- confirm against the real file.
94 Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy)
95 {
100  ARM_COMPUTE_UNUSED(policy);
101  ARM_COMPUTE_RETURN_ERROR_ON(src == dst);
108 
 // QASYMM8_SIGNED may only widen to S16/S32 or convert to F16/F32 (U16 is not accepted).
109  ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::QASYMM8_SIGNED && (dst->data_type() != DataType::S16 && dst->data_type() != DataType::S32
110  && dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32),
111  "Only data_types supported [in] QASYMM8_SIGNED -> [out] S16, S32, F16, F32");
112 
113  ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::QASYMM8 && (dst->data_type() != DataType::S16 && dst->data_type() != DataType::U16
114  && dst->data_type() != DataType::S32 && dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32),
115  "Only data_types supported [in] QASYMM8 -> [out] U16, S16, S32, F16, F32");
116 
117  ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U8 && (dst->data_type() != DataType::S16 && dst->data_type() != DataType::U16
118  && dst->data_type() != DataType::S32 && dst->data_type() != DataType::F16 && dst->data_type() != DataType::F32),
119  "Only data_types supported [in] U8 -> [out] U16, S16, S32, F16, F32");
120 
121  ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::U16 && (dst->data_type() != DataType::U8 && dst->data_type() != DataType::U32),
122  "Only data_types supported [in] U16 -> [out] U8, U32");
123 
124  ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S16 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::U8 && dst->data_type() != DataType::S32),
125  "Only data_types supported [in] S16 -> [out] QASYMM8_SIGNED, U8, S32");
126 
127  ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::BFLOAT16 && dst->data_type() != DataType::F32,
128  "Only data_types supported [in] BFLOAT16 -> [out] F32");
129 
130  ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::F16 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::QASYMM8
131  && dst->data_type() != DataType::U8
132  && dst->data_type() != DataType::F32 && dst->data_type() != DataType::S32),
133  "Only data_types supported [in] F16 -> [out] QASYMM8_SIGNED, QASYMM8, F32, S32, U8");
134 
135  ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::F32 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::QASYMM8
136  && dst->data_type() != DataType::F16 && dst->data_type() != DataType::BFLOAT16
137  && dst->data_type() != DataType::S32 && dst->data_type() != DataType::U8),
138  "Only data_types supported [in] F32 -> [out] QASYMM8_SIGNED, QASYMM8, BFLOAT16, F16, S32, U8");
139 
140  ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::S32 && (dst->data_type() != DataType::QASYMM8_SIGNED && dst->data_type() != DataType::QASYMM8
141  && dst->data_type() != DataType::F16
142  && dst->data_type() != DataType::F32 && dst->data_type() != DataType::U8),
143  "Only data_types supported [in] S32 -> [out] QASYMM8_SIGNED, QASYMM8, F16, F32, U8");
144 
145  // Validate in case of configured dst
146  if(dst->total_size() > 0)
147  {
149  }
150 
151  return Status{};
152 }
153 } // namespace
154 
156 {
 // Body of the kernel's configure step. The signature line (155) is not part of this listing;
 // the identifiers used below (src, dst, policy) are its parameters.
 // NOTE(review): numbered lines 157 and 164 are also missing from the listing; presumably
 // argument validation happens there -- confirm against the real file.
158 
159  // Auto initialize dst shape if not initialized (We can only auto-configure the shape, datatype must be given)
160  set_shape_if_empty(*dst, src->tensor_shape());
161 
 // Remember the conversion policy in the _policy member.
162  _policy = policy;
163 
165 
166  // Configure kernel window
 // The window is computed from src with default-constructed Steps.
167  Window win = calculate_max_window(*src, Steps());
168 
 // Hand the computed window to the ICPPKernel base.
169  ICPPKernel::configure(win);
170 }
171 
173 {
 // As listed, this body only returns a default-constructed Status.
 // NOTE(review): the signature line (172) and numbered line 174 are missing from the listing;
 // presumably 174 forwards to validate_arguments -- confirm against the real file.
175  return Status{};
176 }
177 
179 {
180  ARM_COMPUTE_UNUSED(info);
183 
184  const auto window_start_x = static_cast<int>(window.x().start());
185  const auto window_end_x = static_cast<int>(window.x().end());
186  const int window_step_x = 16;
187 
188  const ITensor *_src = tensors.get_const_tensor(TensorType::ACL_SRC);
189  ITensor *_dst = tensors.get_tensor(TensorType::ACL_DST);
190  ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
191  ARM_COMPUTE_ERROR_ON(_src == _dst);
192 
193  ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
194 
195  Window win{ window };
196  win.set(Window::DimX, Window::Dimension(0, 1, 1));
197 
198  Iterator src(_src, win);
199  Iterator dst(_dst, win);
200 
201  /*ukernel runs only when using fp16/bfloat16, so we validate it isn't a nullptr only before using it */
203 
204  switch(_src->info()->data_type())
205  {
207  {
208  switch(_dst->info()->data_type())
209  {
210  case DataType::S16:
211  {
212  /* Up-conversion QASYMM8_SIGNED -> S16 */
213  execute_window_loop(win, [&](const Coordinates &)
214  {
215  const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
216  const auto dst_ptr = reinterpret_cast<int16_t *>(dst.ptr());
217  int x = window_start_x;
218 
219  for(; x <= (window_end_x - window_step_x); x += window_step_x)
220  {
221  const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
222 
223  const int16x8x2_t texels =
224  {
225  {
226  vmovl_s8(vget_low_s8(texels_s8)),
227  vmovl_s8(vget_high_s8(texels_s8))
228  }
229  };
230 
231  vst1q_s16(dst_ptr + x, texels.val[0]);
232  vst1q_s16(dst_ptr + x + 8, texels.val[1]);
233  }
234 
235  // Compute left-over elements
236  for(; x < window_end_x; ++x)
237  {
238  *(dst_ptr + x) = static_cast<int16_t>(*(src_ptr + x));
239  }
240  },
241  src, dst);
242  break;
243  }
244  case DataType::S32:
245  {
246  /* Up-conversion QASYMM8_SIGNED -> S32 */
247  execute_window_loop(win, [&](const Coordinates &)
248  {
249  const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
250  const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
251  int x = window_start_x;
252 
253  for(; x <= (window_end_x - window_step_x); x += window_step_x)
254  {
255  const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
256 
257  const int16x8x2_t texels =
258  {
259  {
260  vmovl_s8(vget_low_s8(texels_s8)),
261  vmovl_s8(vget_high_s8(texels_s8))
262  }
263  };
264 
265  vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0])));
266  vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0])));
267  vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1])));
268  vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1])));
269  }
270 
271  // Compute left-over elements
272  for(; x < window_end_x; ++x)
273  {
274  *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
275  }
276  },
277  src, dst);
278  break;
279  }
280  case DataType::F32:
281  {
282  /* Up-conversion QASYMM8_SIGNED -> F32 */
283  execute_window_loop(win, [&](const Coordinates &)
284  {
285  const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
286  const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
287 
288  int x = window_start_x;
289  for(; x <= (window_end_x - window_step_x); x += window_step_x)
290  {
291  const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
292 
293  const int16x8x2_t texels =
294  {
295  {
296  vmovl_s8(vget_low_s8(texels_s8)),
297  vmovl_s8(vget_high_s8(texels_s8))
298  }
299  };
300  vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
301  vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
302  vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
303  vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
304  }
305 
306  // Compute left-over elements
307  for(; x < window_end_x; ++x)
308  {
309  *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
310  }
311  },
312  src, dst);
313  break;
314  }
315  case DataType::F16:
316  {
317  /* Up-conversion QASYMM8_SIGNED -> F16 */
318  ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
319  uk->ukernel(_src, _dst, info, _policy, window);
320  break;
321  }
322  default:
323  ARM_COMPUTE_ERROR("dst data type not supported");
324  }
325  break;
326  }
327 
328  case DataType::QASYMM8:
329  case DataType::U8:
330  {
331  switch(_dst->info()->data_type())
332  {
333  case DataType::S16:
334  {
335  /* Up-conversion U8 -> S16 */
336  execute_window_loop(win, [&](const Coordinates &)
337  {
338  const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
339  const auto dst_ptr = reinterpret_cast<int16_t *>(dst.ptr());
340 
341  int x = window_start_x;
342  for(; x <= (window_end_x - window_step_x); x += window_step_x)
343  {
344  const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
345 
346  const int16x8x2_t texels =
347  {
348  {
349  vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
350  vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))
351  }
352  };
353 
354  vst1q_s16(dst_ptr + x, texels.val[0]);
355  vst1q_s16(dst_ptr + x + 8, texels.val[1]);
356  }
357 
358  // Compute left-over elements
359  for(; x < window_end_x; ++x)
360  {
361  *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
362  }
363  },
364  src, dst);
365  break;
366  }
367  case DataType::S32:
368  {
369  /* Up-conversion U8 -> S32 */
370  execute_window_loop(win, [&](const Coordinates &)
371  {
372  const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
373  const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
374 
375  int x = window_start_x;
376  for(; x <= (window_end_x - window_step_x); x += window_step_x)
377  {
378  const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
379 
380  const int16x8x2_t texels =
381  {
382  {
383  vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
384  vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))
385  }
386  };
387 
388  vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0])));
389  vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0])));
390  vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1])));
391  vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1])));
392  }
393 
394  // Compute left-over elements
395  for(; x < window_end_x; ++x)
396  {
397  *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x));
398  }
399  },
400  src, dst);
401  break;
402  }
403  case DataType::F32:
404  {
405  /* Up-conversion U8 -> F32 */
406  execute_window_loop(win, [&](const Coordinates &)
407  {
408  const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
409  const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
410 
411  int x = window_start_x;
412  for(; x <= (window_end_x - window_step_x); x += window_step_x)
413  {
414  const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
415 
416  const int16x8x2_t texels =
417  {
418  {
419  vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
420  vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))
421  }
422  };
423  vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
424  vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
425  vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
426  vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
427  }
428 
429  // Compute left-over elements
430  for(; x < window_end_x; ++x)
431  {
432  *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x));
433  }
434  },
435  src, dst);
436  break;
437  }
438  case DataType::F16:
439  {
440  /* Up-conversion U8 -> FP16 */
441  ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
442  uk->ukernel(_src, _dst, info, _policy, window);
443  break;
444  }
445  case DataType::U16:
446  {
447  /* Up-conversion U8 -> U16 */
448  execute_window_loop(win, [&](const Coordinates &)
449  {
450  const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
451  const auto dst_ptr = reinterpret_cast<uint16_t *>(dst.ptr());
452 
453  int x = window_start_x;
454  for(; x <= (window_end_x - window_step_x); x += window_step_x)
455  {
456  const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
457 
458  const uint16x8x2_t texels =
459  {
460  {
461  vmovl_u8(vget_low_u8(texels_u8)),
462  vmovl_u8(vget_high_u8(texels_u8))
463  }
464  };
465 
466  vst1q_u16(dst_ptr + x, texels.val[0]);
467  vst1q_u16(dst_ptr + x + 8, texels.val[1]);
468  }
469 
470  // Compute left-over elements
471  for(; x < window_end_x; ++x)
472  {
473  *(dst_ptr + x) = static_cast<uint16_t>(*(src_ptr + x));
474  }
475  },
476  src, dst);
477  break;
478  }
479  default:
480  ARM_COMPUTE_ERROR("dst data type not supported");
481  }
482  break;
483  }
484  case DataType::S16:
485  {
486  switch(_dst->info()->data_type())
487  {
489  {
490  /* Down-conversion S16 -> QASYMM8_SIGNED */
491  if(ConvertPolicy::SATURATE == _policy)
492  {
493  execute_window_loop(win, [&](const Coordinates &)
494  {
495  const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
496  const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
497 
498  int x = window_start_x;
499  for(; x <= (window_end_x - window_step_x); x += window_step_x)
500  {
501  const int16x8x2_t texels =
502  {
503  {
504  vld1q_s16(src_ptr + x),
505  vld1q_s16(src_ptr + x + 8)
506  }
507  };
508 
509  vst1q_s8(dst_ptr + x, vcombine_s8(vqmovn_s16(texels.val[0]), vqmovn_s16(texels.val[1])));
510  }
511 
512  // Compute left-over elements
513  for(; x < window_end_x; ++x)
514  {
515  *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
516  }
517  },
518  src, dst);
519  }
520  else
521  {
522  execute_window_loop(win, [&](const Coordinates &)
523  {
524  const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
525  const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
526 
527  int x = window_start_x;
528  for(; x <= (window_end_x - window_step_x); x += window_step_x)
529  {
530  const int16x8x2_t texels =
531  {
532  {
533  vld1q_s16(src_ptr + x),
534  vld1q_s16(src_ptr + x + 8)
535  }
536  };
537 
538  vst1q_s8(dst_ptr + x, vcombine_s8(vmovn_s16(texels.val[0]), vmovn_s16(texels.val[1])));
539  }
540 
541  // Compute left-over elements
542  for(; x < window_end_x; ++x)
543  {
544  *(dst_ptr + x) = static_cast<int8_t>(*(src_ptr + x));
545  }
546  },
547  src, dst);
548  }
549  break;
550  }
551  case DataType::U8:
552  {
553  /* Down-conversion S16 -> U8 */
554  if(ConvertPolicy::SATURATE == _policy)
555  {
556  execute_window_loop(win, [&](const Coordinates &)
557  {
558  const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
559  const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
560 
561  int x = window_start_x;
562  for(; x <= (window_end_x - window_step_x); x += window_step_x)
563  {
564  const int16x8x2_t texels =
565  {
566  {
567  vld1q_s16(src_ptr + x),
568  vld1q_s16(src_ptr + x + 8)
569  }
570  };
571 
572  vst1q_u8(dst_ptr + x, vcombine_u8(vqmovun_s16(texels.val[0]), vqmovun_s16(texels.val[1])));
573  }
574 
575  // Compute left-over elements
576  for(; x < window_end_x; ++x)
577  {
578  *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
579  }
580  },
581  src, dst);
582  }
583  else
584  {
585  execute_window_loop(win, [&](const Coordinates &)
586  {
587  const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
588  const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
589 
590  int x = window_start_x;
591  for(; x <= (window_end_x - window_step_x); x += window_step_x)
592  {
593  const int16x8x2_t texels =
594  {
595  {
596  vld1q_s16(src_ptr + x),
597  vld1q_s16(src_ptr + x + 8)
598  }
599  };
600 
601  vst1q_u8(dst_ptr + x, vcombine_u8(vmovn_u16(vreinterpretq_u16_s16(texels.val[0])),
602  vmovn_u16(vreinterpretq_u16_s16(texels.val[1]))));
603  }
604 
605  // Compute left-over elements
606  for(; x < window_end_x; ++x)
607  {
608  *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x));
609  }
610  },
611  src, dst);
612  }
613  break;
614  }
615  case DataType::S32:
616  {
617  /* Up-conversion S16 -> S32 */
618  execute_window_loop(win, [&](const Coordinates &)
619  {
620  const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
621  const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
622 
623  int x = window_start_x;
624  for(; x <= (window_end_x - window_step_x); x += window_step_x)
625  {
626  const int16x8x2_t texels =
627  {
628  {
629  vld1q_s16(src_ptr + x),
630  vld1q_s16(src_ptr + x + 8)
631  }
632  };
633 
634  const int32x4x4_t texels_s32 =
635  {
636  {
637  vmovl_s16(vget_low_s16(texels.val[0])),
638  vmovl_s16(vget_high_s16(texels.val[0])),
639  vmovl_s16(vget_low_s16(texels.val[1])),
640  vmovl_s16(vget_high_s16(texels.val[1]))
641  }
642  };
643 
644  vst1q_s32(dst_ptr + x, texels_s32.val[0]);
645  vst1q_s32(dst_ptr + x + 4, texels_s32.val[1]);
646  vst1q_s32(dst_ptr + x + 8, texels_s32.val[2]);
647  vst1q_s32(dst_ptr + x + 12, texels_s32.val[3]);
648  }
649 
650  // Compute left-over elements
651  for(; x < window_end_x; ++x)
652  {
653  *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
654  }
655  },
656  src, dst);
657  break;
658  }
659  default:
660  ARM_COMPUTE_ERROR("dst data type not supported");
661  }
662  break;
663  }
664 
665  case DataType::U16:
666  {
667  switch(_dst->info()->data_type())
668  {
669  case DataType::U8:
670  {
671  /* Down-conversion U16 -> U8 */
672  if(ConvertPolicy::SATURATE == _policy)
673  {
674  execute_window_loop(win, [&](const Coordinates &)
675  {
676  const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr());
677  const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
678 
679  int x = window_start_x;
680  for(; x <= (window_end_x - window_step_x); x += window_step_x)
681  {
682  const uint16x8x2_t texels =
683  {
684  {
685  vld1q_u16(src_ptr + x),
686  vld1q_u16(src_ptr + x + 8)
687  }
688  };
689 
690  vst1q_u8(dst_ptr + x, vcombine_u8(vqmovn_u16(texels.val[0]), vqmovn_u16(texels.val[1])));
691  }
692 
693  // Compute left-over elements
694  for(; x < window_end_x; ++x)
695  {
696  *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
697  }
698  },
699  src, dst);
700  }
701  else
702  {
703  execute_window_loop(win, [&](const Coordinates &)
704  {
705  const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr());
706  const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
707 
708  int x = window_start_x;
709  for(; x <= (window_end_x - window_step_x); x += window_step_x)
710  {
711  const uint16x8x2_t texels =
712  {
713  {
714  vld1q_u16(src_ptr + x),
715  vld1q_u16(src_ptr + x + 8)
716  }
717  };
718 
719  vst1q_u8(dst_ptr + x, vcombine_u8(vmovn_u16(texels.val[0]), vmovn_u16(texels.val[1])));
720  }
721 
722  // Compute left-over elements
723  for(; x < window_end_x; ++x)
724  {
725  *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x));
726  }
727 
728  },
729  src, dst);
730  }
731  break;
732  }
733  case DataType::U32:
734  {
735  /* Up-conversion U16 -> U32 */
736  execute_window_loop(win, [&](const Coordinates &)
737  {
738  const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr());
739  const auto dst_ptr = reinterpret_cast<uint32_t *>(dst.ptr());
740 
741  int x = window_start_x;
742  for(; x <= (window_end_x - window_step_x); x += window_step_x)
743  {
744  const uint16x8x2_t texels =
745  {
746  {
747  vld1q_u16(src_ptr + x),
748  vld1q_u16(src_ptr + x + 8)
749  }
750  };
751 
752  vst1q_u32(dst_ptr + x, vmovl_u16(vget_low_u16(texels.val[0])));
753  vst1q_u32(dst_ptr + x + 4, vmovl_u16(vget_high_u16(texels.val[0])));
754  vst1q_u32(dst_ptr + x + 8, vmovl_u16(vget_low_u16(texels.val[1])));
755  vst1q_u32(dst_ptr + x + 12, vmovl_u16(vget_high_u16(texels.val[1])));
756  }
757  // Compute left-over elements
758  for(; x < window_end_x; ++x)
759  {
760  *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x));
761  }
762 
763  },
764  src, dst);
765  break;
766  }
767  default:
768  ARM_COMPUTE_ERROR("dst data type not supported");
769  }
770  break;
771  }
772  case DataType::BFLOAT16:
773  {
774  /* Up-conversion BFLOAT16 -> F32 */
775  ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
776  uk->ukernel(_src, _dst, info, _policy, window);
777  break;
778  }
779  case DataType::F16:
780  {
781  /* conversion F16 -> any data type */
782  ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
783  uk->ukernel(_src, _dst, info, _policy, window);
784  break;
785  }
786  case DataType::F32:
787  switch(_dst->info()->data_type())
788  {
789  case DataType::F16:
790  {
791  /* Down-conversion F32 -> F16 */
792  ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
793  uk->ukernel(_src, _dst, info, _policy, window);
794  break;
795  }
796  case DataType::BFLOAT16:
797  {
798  /* Down-conversion F32 -> BFLOAT16 */
799  ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
800  uk->ukernel(_src, _dst, info, _policy, window);
801  break;
802  }
803  case DataType::S32:
804  {
805  /* Conversion F32 -> S32 */
806  execute_window_loop(win, [&](const Coordinates &)
807  {
808  const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
809  const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
810 
811  int x = window_start_x;
812  for(; x <= (window_end_x - window_step_x); x += window_step_x)
813  {
814  const float32x4x4_t texels =
815  {
816  {
817  vld1q_f32(src_ptr + x),
818  vld1q_f32(src_ptr + x + 4),
819  vld1q_f32(src_ptr + x + 8),
820  vld1q_f32(src_ptr + x + 12),
821  }
822  };
823 
824  vst1q_s32(dst_ptr + x, vcvtq_s32_f32(texels.val[0]));
825  vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(texels.val[1]));
826  vst1q_s32(dst_ptr + x + 8, vcvtq_s32_f32(texels.val[2]));
827  vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(texels.val[3]));
828  }
829 
830  // Compute left-over elements
831  for(; x < window_end_x; ++x)
832  {
833  *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
834  }
835  },
836  src, dst);
837  break;
838  }
839  case DataType::QASYMM8:
840  case DataType::U8:
841  {
842  /* Down-conversion F32 -> U8 */
843  execute_window_loop(win, [&](const Coordinates &)
844  {
845  const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
846  const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
847 
848  int x = window_start_x;
849  for(; x <= (window_end_x - window_step_x); x += window_step_x)
850  {
851  const float32x4x4_t texels =
852  {
853  {
854  vld1q_f32(src_ptr + x),
855  vld1q_f32(src_ptr + x + 4),
856  vld1q_f32(src_ptr + x + 8),
857  vld1q_f32(src_ptr + x + 12),
858  }
859  };
860 
861  vst1_u8(dst_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[0])), vqmovun_s32(vcvtq_s32_f32(texels.val[1])))));
862  vst1_u8(dst_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[2])), vqmovun_s32(vcvtq_s32_f32(texels.val[3])))));
863  }
864 
865  // Compute left-over elements
866  for(; x < window_end_x; ++x)
867  {
868  *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
869  }
870  },
871  src, dst);
872  break;
873  }
875  {
876  /* Down-conversion F32 -> QASYMM8_SIGNED */
877  execute_window_loop(win, [&](const Coordinates &)
878  {
879  const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
880  const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
881 
882  int x = window_start_x;
883  for(; x <= (window_end_x - window_step_x); x += window_step_x)
884  {
885  const float32x4x4_t texels =
886  {
887  {
888  vld1q_f32(src_ptr + x),
889  vld1q_f32(src_ptr + x + 4),
890  vld1q_f32(src_ptr + x + 8),
891  vld1q_f32(src_ptr + x + 12),
892  }
893  };
894 
895  vst1_s8(dst_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[0])), vqmovn_s32(vcvtq_s32_f32(texels.val[1])))));
896  vst1_s8(dst_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[2])), vqmovn_s32(vcvtq_s32_f32(texels.val[3])))));
897  }
898  // Compute left-over elements
899  for(; x < window_end_x; ++x)
900  {
901  *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
902  }
903  },
904  src, dst);
905  break;
906  }
907 
908  default:
909  ARM_COMPUTE_ERROR("dst data type not supported");
910  }
911  break;
912 
913  case DataType::S32:
914  switch(_dst->info()->data_type())
915  {
916  case DataType::F16:
917  {
918  /* Down-conversion S32 -> F16 */
919  ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
920  uk->ukernel(_src, _dst, info, _policy, window);
921  break;
922  }
923  case DataType::F32:
924  {
925  /* Conversion S32 -> F32 */
926  execute_window_loop(win, [&](const Coordinates &)
927  {
928  const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
929  const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
930 
931  int x = window_start_x;
932  for(; x <= (window_end_x - window_step_x); x += window_step_x)
933  {
934  const int32x4x4_t texels =
935  {
936  {
937  vld1q_s32(src_ptr + x),
938  vld1q_s32(src_ptr + x + 4),
939  vld1q_s32(src_ptr + x + 8),
940  vld1q_s32(src_ptr + x + 12),
941  }
942  };
943 
944  vst1q_f32(dst_ptr + x, vcvtq_f32_s32(texels.val[0]));
945  vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(texels.val[1]));
946  vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(texels.val[2]));
947  vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(texels.val[3]));
948  }
949 
950  // Compute left-over elements
951  for(; x < window_end_x; ++x)
952  {
953  *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
954  }
955  },
956  src, dst);
957  break;
958  }
960  {
961  /* Down-conversion S32 -> QASYMM8_SIGNED */
962  if(ConvertPolicy::SATURATE == _policy)
963  {
964  execute_window_loop(win, [&](const Coordinates &)
965  {
966  const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
967  const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
968 
969  int x = window_start_x;
970  for(; x <= (window_end_x - window_step_x); x += window_step_x)
971  {
972  const int32x4x4_t texels =
973  {
974  {
975  vld1q_s32(src_ptr + x),
976  vld1q_s32(src_ptr + x + 4),
977  vld1q_s32(src_ptr + x + 8),
978  vld1q_s32(src_ptr + x + 12),
979  }
980  };
981  vst1_s8(dst_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[0]), vqmovn_s32(texels.val[1]))));
982  vst1_s8(dst_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[2]), vqmovn_s32(texels.val[3]))));
983  }
984 
985  // Compute left-over elements
986  for(; x < window_end_x; ++x)
987  {
988  *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
989  }
990  },
991  src, dst);
992  }
993  else
994  {
995  execute_window_loop(win, [&](const Coordinates &)
996  {
997  const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
998  const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
999 
1000  int x = window_start_x;
1001  for(; x <= (window_end_x - window_step_x); x += window_step_x)
1002  {
1003  const int32x4x4_t texels =
1004  {
1005  {
1006  vld1q_s32(src_ptr + x),
1007  vld1q_s32(src_ptr + x + 4),
1008  vld1q_s32(src_ptr + x + 8),
1009  vld1q_s32(src_ptr + x + 12)
1010  }
1011  };
1012 
1013  vst1_s8(dst_ptr + x, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[0]), vmovn_s32(texels.val[1]))));
1014  vst1_s8(dst_ptr + x + 8, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[2]), vmovn_s32(texels.val[3]))));
1015  }
1016 
1017  // Compute left-over elements
1018  for(; x < window_end_x; ++x)
1019  {
1020  *(dst_ptr + x) = static_cast<int8_t>(*(src_ptr + x));
1021  }
1022  },
1023  src, dst);
1024  }
1025  break;
1026  }
1027  case DataType::QASYMM8:
1028  case DataType::U8:
1029  {
1030  /* Down-conversion S32 -> U8 */
1031  if(ConvertPolicy::SATURATE == _policy)
1032  {
1033  execute_window_loop(win, [&](const Coordinates &)
1034  {
1035  const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
1036  const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
1037 
1038  int x = window_start_x;
1039  for(; x <= (window_end_x - window_step_x); x += window_step_x)
1040  {
1041  const int32x4x4_t texels =
1042  {
1043  {
1044  vld1q_s32(src_ptr + x),
1045  vld1q_s32(src_ptr + x + 4),
1046  vld1q_s32(src_ptr + x + 8),
1047  vld1q_s32(src_ptr + x + 12)
1048  }
1049  };
1050  vst1_u8(dst_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[0]), vqmovun_s32(texels.val[1]))));
1051  vst1_u8(dst_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[2]), vqmovun_s32(texels.val[3]))));
1052  }
1053 
1054  // Compute left-over elements
1055  for(; x < window_end_x; ++x)
1056  {
1057  *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
1058  }
1059  },
1060  src, dst);
1061  }
1062  else
1063  {
1064  execute_window_loop(win, [&](const Coordinates &)
1065  {
1066  const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
1067  const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
1068 
1069  int x = window_start_x;
1070  for(; x <= (window_end_x - window_step_x); x += window_step_x)
1071  {
1072  const int32x4x4_t texels =
1073  {
1074  {
1075  vld1q_s32(src_ptr + x),
1076  vld1q_s32(src_ptr + x + 4),
1077  vld1q_s32(src_ptr + x + 8),
1078  vld1q_s32(src_ptr + x + 12)
1079  }
1080  };
1081 
1082  vst1_u8(dst_ptr + x, vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[0])), vmovn_u32(vreinterpretq_u32_s32(texels.val[1])))));
1083  vst1_u8(dst_ptr + x + 8, vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[2])), vmovn_u32(vreinterpretq_u32_s32(texels.val[3])))));
1084  }
1085 
1086  // Compute left-over elements
1087  for(; x < window_end_x; ++x)
1088  {
1089  *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x));
1090  }
1091  },
1092  src, dst);
1093  }
1094  break;
1095  }
1096  default:
1097  ARM_COMPUTE_ERROR("dst data type not supported");
1098  }
1099  break;
1100  default:
1101  ARM_COMPUTE_ERROR("Not supported");
1102  }
1103 }
1104 
// Returns the kernel's identifying string as a pointer to a string literal.
// NOTE(review): the literal includes the ".cpp" suffix; confirm that is intended before
// relying on it, since changing it would alter whatever consumes this name.
1105 const char *CpuCastKernel::name() const
1106 {
1107  return "CpuCastKernel.cpp";
1108 }
1109 
// Returns a const reference to the file-local available_kernels table.
1110 const std::vector<CpuCastKernel::CastKernel> &CpuCastKernel::get_available_kernels()
1111 {
1112  return available_kernels;
1113 }
1114 
1115 } // namespace kernels
1116 } // namespace cpu
1117 } // namespace arm_compute
void configure(const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy)
Set the src and dst of the kernel.
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
const Window & window() const
The maximum window the kernel can be executed on.
Definition: IKernel.cpp:28
static const auto * get_implementation(const SelectorType &selector, KernelSelectionType selection_type=KernelSelectionType::Supported)
Micro-kernel selector.
Definition: ICpuKernel.h:53
#define ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(tensor)
Definition: Validate.h:115
#define REGISTER_FP16_NEON(func_name)
Definition: Registrars.h:48
static const std::vector< CastKernel > & get_available_kernels()
#define ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(tensor)
Definition: Validate.h:121
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
Definition: Error.h:352
1 channel, 1 U8 per channel
void neon_s32_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo &tensor, ConvertPolicy _policy, const Window &window)
#define ARM_COMPUTE_RETURN_ON_ERROR(status)
Checks if a status contains an error and returns it.
Definition: Error.h:204
virtual DataType data_type() const =0
Data type used for each element of the tensor.
1 channel, 1 F32 per channel
void neon_bfloat16_to_fp32_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo &tensor, ConvertPolicy _policy, const Window &window)
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
Definition: Error.h:466
Store the tensor's metadata.
Definition: ITensorInfo.h:40
#define ARM_COMPUTE_ERROR_THROW_ON(status)
Definition: Error.h:455
Describe one of the image's dimensions with a start, end and step.
Definition: Window.h:79
1 channel, 1 U16 per channel
Status class.
Definition: Error.h:52
Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const PadStrideInfo &conv_info)
#define ARM_COMPUTE_RETURN_ERROR_ON(cond)
If the condition is true, an error is returned.
Definition: Error.h:296
Interface for CPU tensor.
Definition: ITensor.h:36
SimpleTensor< float > src
Definition: DFT.cpp:155
Copyright (c) 2017-2022 Arm Limited.
1 channel, 1 F16 per channel
void neon_fp32_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo &tensor, ConvertPolicy _policy, const Window &window)
1 channel, 1 S32 per channel
16-bit brain floating-point number
const ITensor * get_const_tensor(int id) const
Get constant tensor of a given id.
Definition: ITensorPack.cpp:54
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
Definition: Window.h:43
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
Definition: Error.h:152
1 channel, 1 U32 per channel
virtual const TensorShape & tensor_shape() const =0
Size for each dimension of the tensor.
void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
quantized, asymmetric fixed-point 8-bit number unsigned
Class to describe a number of elements in each dimension.
Definition: Steps.h:40
Coordinates of an item.
Definition: Coordinates.h:37
#define REGISTER_BF16_NEON(func_name)
Definition: Registrars.h:179
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
bool set_shape_if_empty(ITensorInfo &info, const TensorShape &shape)
Set the shape to the specified value if the current assignment is empty.
constexpr uint8_t * ptr() const
Return a pointer to the current pixel.
Definition: Helpers.inl:139
void set(size_t dimension, const Dimension &dim)
Set the values of a given dimension.
Definition: Window.inl:49
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
Definition: Validate.h:915
1 channel, 1 S16 per channel
void neon_u8_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo &tensor, ConvertPolicy _policy, const Window &window)
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
ITensor * get_tensor(int id)
Get tensor of a given id from the pack.
Definition: ITensorPack.cpp:64
Information about executing thread and CPU.
Definition: CPPTypes.h:179
void neon_fp16_to_other_dt_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo &tensor, ConvertPolicy _policy, const Window &window)
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(...)
Definition: Validate.h:439
static Status validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy)
Static function to check if given info will lead to a valid configuration.
const char * name() const override
Name of the kernel.
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
Definition: Validate.h:788
#define ARM_COMPUTE_RETURN_ERROR_ON_MSG(cond, msg)
If the condition is true, an error is returned.
Definition: Error.h:244
Tensor packing service.
Definition: ITensorPack.h:39
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
Definition: Validate.h:157
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_funct...
Definition: Helpers.inl:77
quantized, asymmetric fixed-point 8-bit number signed
Includes all wrapper headers at once.
static CPUInfo & get()
Access the CPUInfo singleton.
Definition: CPPTypes.cpp:40
constexpr int end() const
Return the end of the dimension.
Definition: Window.h:102
Iterator updated by execute_window_loop for each window element.
Definition: Helpers.h:46
constexpr int start() const
Return the start of the dimension.
Definition: Window.h:97
Describe a multidimensional execution window.
Definition: Window.h:39
ConvertPolicy
Policy to handle integer overflow.
Definition: Types.h:404
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)
Definition: Validate.h:201
cpuinfo::CpuIsaInfo get_isa() const
Gets the current CPU's ISA information.
Definition: CPPTypes.cpp:124
constexpr const Dimension & x() const
Alias to access the first dimension of the window.
Definition: Window.h:159
void neon_fp32_to_bfloat16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo &tensor, ConvertPolicy _policy, const Window &window)
void neon_qasymm8_signed_to_fp16_cast(const ITensor *_src, ITensor *_dst, const ThreadInfo &tensor, ConvertPolicy _policy, const Window &window)