Casts a given tensor to a new type. More...

#include <CpuCastKernel.h>

Collaboration diagram for CpuCastKernel:

Data Structures
struct	CastKernel

Public Member Functions
	CpuCastKernel ()=default

	ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE (CpuCastKernel)

void	configure (const ITensorInfo src, ITensorInfo dst, ConvertPolicy policy)
	Set the src and dst of the kernel. More...

void	run_op (ITensorPack &tensors, const Window &window, const ThreadInfo &info) override
	Execute the kernel on the passed window. More...

const char *	name () const override
	Name of the kernel. More...

Public Member Functions inherited from ICPPKernel
virtual	~ICPPKernel ()=default
	Default destructor. More...

virtual void	run (const Window &window, const ThreadInfo &info)
	Execute the kernel on the passed window. More...

virtual void	run_nd (const Window &window, const ThreadInfo &info, const Window &thread_locator)
	legacy compatibility layer for implemantions which do not support thread_locator In these cases we simply narrow the interface down the legacy version More...

virtual size_t	get_mws (const CPUInfo &platform, size_t thread_count) const
	Return minimum workload size of the relevant kernel. More...

Public Member Functions inherited from IKernel
	IKernel ()
	Constructor. More...

virtual	~IKernel ()=default
	Destructor. More...

virtual bool	is_parallelisable () const
	Indicates whether or not the kernel is parallelisable. More...

virtual BorderSize	border_size () const
	The size of the border for that kernel. More...

const Window &	window () const
	The maximum window the kernel can be executed on. More...

bool	is_window_configured () const
	Function to check if the embedded window of this kernel has been configured. More...

Static Public Member Functions
static Status	validate (const ITensorInfo src, const ITensorInfo dst, ConvertPolicy policy)
	Static function to check if given info will lead to a valid configuration. More...

static const std::vector< CastKernel > &	get_available_kernels ()

Static Public Member Functions inherited from ICpuKernel< CpuCastKernel >
static const auto *	get_implementation (const SelectorType &selector, KernelSelectionType selection_type=KernelSelectionType::Supported)
	Micro-kernel selector. More...

Additional Inherited Members
Static Public Attributes inherited from ICPPKernel
static constexpr size_t	default_mws = 1

Detailed Description

Casts a given tensor to a new type.

Note: When casting between quantized types the scale and zeroPoint are ignored

Definition at line 40 of file CpuCastKernel.h.

Constructor & Destructor Documentation

◆ CpuCastKernel()

CpuCastKernel ( )

default

Member Function Documentation

◆ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE()

ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE ( CpuCastKernel )

◆ configure()

void configure	(	const ITensorInfo *	src,
		ITensorInfo *	dst,
		ConvertPolicy	policy
	)

Set the src and dst of the kernel.

Valid conversions src -> dst :

QASYMM8_SIGNED -> S16, S32, F32, F16
QASYMM8 -> U16, S16, S32, F32, F16
U8 -> U16, S16, S32, F32, F16
U16 -> U8, U32
S16 -> QASYMM8_SIGNED, U8, S32
F16 -> QASYMM8_SIGNED, QASYMM8, F32, S32, U8
S32 -> QASYMM8_SIGNED, QASYMM8, F16, F32, U8
S64 -> F32
F32 -> QASYMM8_SIGNED, QASYMM8, F16, S32, U8

Parameters

[in]	src	The src tensor to convert. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/U16/S16/S32/S64/F16/F32.
[out]	dst	The dst tensor. Data types supported: QASYMM8_SIGNED/QASYMM8/U8/U16/S16/U32/S32/S64/F16/F32.
[in]	policy	Conversion policy.

Note: S64 is only supported in aarch64

Definition at line 163 of file CpuCastKernel.cpp.

 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
  
     // Auto initialize dst shape if not initialized (We can only auto-configure the shape, datatype must be given)
     set_shape_if_empty(*dst, src->tensor_shape());
  
     _policy = policy;
  
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, policy));
  
     // Configure kernel window
     Window win = calculate_max_window(*src, Steps());
  
     ICPPKernel::configure(win);
 }

References ARM_COMPUTE_ERROR_ON_NULLPTR, ARM_COMPUTE_ERROR_THROW_ON, arm_compute::calculate_max_window(), arm_compute::test::validation::dst, arm_compute::set_shape_if_empty(), arm_compute::test::validation::src, and arm_compute::cpu::kernels::validate_arguments().

◆ get_available_kernels()

const std::vector< CpuCastKernel::CastKernel > & get_available_kernels ( )

static

Definition at line 1172 of file CpuCastKernel.cpp.

 {
     return available_kernels;
 }

◆ name()

const char * name ( ) const

overridevirtual

Name of the kernel.

Returns: Kernel name

Implements ICPPKernel.

Definition at line 1167 of file CpuCastKernel.cpp.

 {
     return "CpuCastKernel.cpp";
 }

◆ run_op()

void run_op	(	ITensorPack &	tensors,
		const Window &	window,
		const ThreadInfo &	info
	)

overridevirtual

Execute the kernel on the passed window.

Warning: If is_parallelisable() returns false then the passed window must be equal to window()

Note: The window has to be a region within the window returned by the window() method; The width of the window has to be a multiple of num_elems_processed_per_iteration().

Parameters

[in]	tensors	A vector containing the tensors to operate on.
[in]	window	Region on which to execute the kernel. (Must be a region of the window returned by window())
[in]	info	Info about executing thread and CPU.

Reimplemented from ICPPKernel.

Definition at line 271 of file CpuCastKernel.cpp.

 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window);
  
     const auto window_start_x = static_cast<int>(window.x().start());
     const auto window_end_x   = static_cast<int>(window.x().end());
     const int  window_step_x  = 16;
  
     const ITensor *_src = tensors.get_const_tensor(TensorType::ACL_SRC);
     ITensor       *_dst = tensors.get_tensor(TensorType::ACL_DST);
     ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
     ARM_COMPUTE_ERROR_ON(_src == _dst);
  
     ARM_COMPUTE_ERROR_ON_NULLPTR(_src, _dst);
  
     Window win{window};
     win.set(Window::DimX, Window::Dimension(0, 1, 1));
  
     Iterator src(_src, win);
     Iterator dst(_dst, win);
  
     /*ukernel runs only when using fp16, so we validate it isn't a nullptr only before using it */
     const auto *uk = CpuCastKernel::get_implementation(
         CastDataTypeISASelectorData{_src->info()->data_type(), _dst->info()->data_type(), CPUInfo::get().get_isa()});
  
     switch (_src->info()->data_type())
     {
 #ifdef __aarch64__
         case DataType::U64:
         {
             switch (_dst->info()->data_type())
             {
                 case DataType::F32:
                 {
                     convert64<uint64_t, float>(src, dst, win, window_start_x, window_end_x, window_step_x);
                     break;
                 }
                 default:
                     ARM_COMPUTE_ERROR("dst data type not supported");
             }
             break;
         }
         case DataType::S64:
         {
             switch (_dst->info()->data_type())
             {
                 case DataType::F32:
                 {
                     convert64<int64_t, float>(src, dst, win, window_start_x, window_end_x, window_step_x);
                     break;
                 }
                 default:
                     ARM_COMPUTE_ERROR("dst data type not supported");
             }
             break;
         }
 #endif // __aarch64__
  
         case DataType::QASYMM8_SIGNED:
         {
             switch (_dst->info()->data_type())
             {
                 case DataType::S16:
                 {
                     /* Up-conversion QASYMM8_SIGNED -> S16 */
                     execute_window_loop(
                         win,
                         [&](const Coordinates &)
                         {
                             const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
                             const auto dst_ptr = reinterpret_cast<int16_t *>(dst.ptr());
                             int        x       = window_start_x;
  
                             for (; x <= (window_end_x - window_step_x); x += window_step_x)
                             {
                                 const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
  
                                 const int16x8x2_t texels = {
                                     {vmovl_s8(vget_low_s8(texels_s8)), vmovl_s8(vget_high_s8(texels_s8))}};
  
                                 vst1q_s16(dst_ptr + x, texels.val[0]);
                                 vst1q_s16(dst_ptr + x + 8, texels.val[1]);
                             }
  
                             // Compute left-over elements
                             for (; x < window_end_x; ++x)
                             {
                                 *(dst_ptr + x) = static_cast<int16_t>(*(src_ptr + x));
                             }
                         },
                         src, dst);
                     break;
                 }
                 case DataType::S32:
                 {
                     /* Up-conversion QASYMM8_SIGNED -> S32 */
                     execute_window_loop(
                         win,
                         [&](const Coordinates &)
                         {
                             const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
                             const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
                             int        x       = window_start_x;
  
                             for (; x <= (window_end_x - window_step_x); x += window_step_x)
                             {
                                 const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
  
                                 const int16x8x2_t texels = {
                                     {vmovl_s8(vget_low_s8(texels_s8)), vmovl_s8(vget_high_s8(texels_s8))}};
  
                                 vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0])));
                                 vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0])));
                                 vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1])));
                                 vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1])));
                             }
  
                             // Compute left-over elements
                             for (; x < window_end_x; ++x)
                             {
                                 *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
                             }
                         },
                         src, dst);
                     break;
                 }
                 case DataType::F32:
                 {
                     /* Up-conversion QASYMM8_SIGNED -> F32 */
                     execute_window_loop(
                         win,
                         [&](const Coordinates &)
                         {
                             const auto src_ptr = reinterpret_cast<const int8_t *>(src.ptr());
                             const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
  
                             int x = window_start_x;
                             for (; x <= (window_end_x - window_step_x); x += window_step_x)
                             {
                                 const int8x16_t texels_s8 = vld1q_s8(src_ptr + x);
  
                                 const int16x8x2_t texels = {
                                     {vmovl_s8(vget_low_s8(texels_s8)), vmovl_s8(vget_high_s8(texels_s8))}};
                                 vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
                                 vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
                                 vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
                                 vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
                             }
  
                             // Compute left-over elements
                             for (; x < window_end_x; ++x)
                             {
                                 *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
                             }
                         },
                         src, dst);
                     break;
                 }
                 case DataType::F16:
                 {
                     /* Up-conversion QASYMM8_SIGNED -> F16 */
                     ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
                     uk->ukernel(_src, _dst, info, _policy, window);
                     break;
                 }
                 default:
                     ARM_COMPUTE_ERROR("dst data type not supported");
             }
             break;
         }
  
         case DataType::QASYMM8:
         case DataType::U8:
         {
             switch (_dst->info()->data_type())
             {
                 case DataType::S16:
                 {
                     /* Up-conversion U8 -> S16 */
                     execute_window_loop(
                         win,
                         [&](const Coordinates &)
                         {
                             const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
                             const auto dst_ptr = reinterpret_cast<int16_t *>(dst.ptr());
  
                             int x = window_start_x;
                             for (; x <= (window_end_x - window_step_x); x += window_step_x)
                             {
                                 const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
  
                                 const int16x8x2_t texels = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
                                                              vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))}};
  
                                 vst1q_s16(dst_ptr + x, texels.val[0]);
                                 vst1q_s16(dst_ptr + x + 8, texels.val[1]);
                             }
  
                             // Compute left-over elements
                             for (; x < window_end_x; ++x)
                             {
                                 *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
                             }
                         },
                         src, dst);
                     break;
                 }
                 case DataType::S32:
                 {
                     /* Up-conversion U8 -> S32 */
                     execute_window_loop(
                         win,
                         [&](const Coordinates &)
                         {
                             const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
                             const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
  
                             int x = window_start_x;
                             for (; x <= (window_end_x - window_step_x); x += window_step_x)
                             {
                                 const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
  
                                 const int16x8x2_t texels = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
                                                              vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))}};
  
                                 vst1q_s32(dst_ptr + x, vmovl_s16(vget_low_s16(texels.val[0])));
                                 vst1q_s32(dst_ptr + x + 4, vmovl_s16(vget_high_s16(texels.val[0])));
                                 vst1q_s32(dst_ptr + x + 8, vmovl_s16(vget_low_s16(texels.val[1])));
                                 vst1q_s32(dst_ptr + x + 12, vmovl_s16(vget_high_s16(texels.val[1])));
                             }
  
                             // Compute left-over elements
                             for (; x < window_end_x; ++x)
                             {
                                 *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x));
                             }
                         },
                         src, dst);
                     break;
                 }
                 case DataType::F32:
                 {
                     /* Up-conversion U8 -> F32 */
                     execute_window_loop(
                         win,
                         [&](const Coordinates &)
                         {
                             const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
                             const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
  
                             int x = window_start_x;
                             for (; x <= (window_end_x - window_step_x); x += window_step_x)
                             {
                                 const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
  
                                 const int16x8x2_t texels = {{vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))),
                                                              vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8)))}};
                                 vst1q_f32(dst_ptr + x, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[0]))));
                                 vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[0]))));
                                 vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(vmovl_s16(vget_low_s16(texels.val[1]))));
                                 vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(vmovl_s16(vget_high_s16(texels.val[1]))));
                             }
  
                             // Compute left-over elements
                             for (; x < window_end_x; ++x)
                             {
                                 *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x));
                             }
                         },
                         src, dst);
                     break;
                 }
                 case DataType::F16:
                 {
                     /* Up-conversion U8 -> FP16 */
                     ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
                     uk->ukernel(_src, _dst, info, _policy, window);
                     break;
                 }
                 case DataType::U16:
                 {
                     /* Up-conversion U8 -> U16 */
                     execute_window_loop(
                         win,
                         [&](const Coordinates &)
                         {
                             const auto src_ptr = reinterpret_cast<const uint8_t *>(src.ptr());
                             const auto dst_ptr = reinterpret_cast<uint16_t *>(dst.ptr());
  
                             int x = window_start_x;
                             for (; x <= (window_end_x - window_step_x); x += window_step_x)
                             {
                                 const uint8x16_t texels_u8 = vld1q_u8(src_ptr + x);
  
                                 const uint16x8x2_t texels = {
                                     {vmovl_u8(vget_low_u8(texels_u8)), vmovl_u8(vget_high_u8(texels_u8))}};
  
                                 vst1q_u16(dst_ptr + x, texels.val[0]);
                                 vst1q_u16(dst_ptr + x + 8, texels.val[1]);
                             }
  
                             // Compute left-over elements
                             for (; x < window_end_x; ++x)
                             {
                                 *(dst_ptr + x) = static_cast<uint16_t>(*(src_ptr + x));
                             }
                         },
                         src, dst);
                     break;
                 }
                 default:
                     ARM_COMPUTE_ERROR("dst data type not supported");
             }
             break;
         }
         case DataType::S16:
         {
             switch (_dst->info()->data_type())
             {
                 case DataType::QASYMM8_SIGNED:
                 {
                     /* Down-conversion S16 -> QASYMM8_SIGNED */
                     if (ConvertPolicy::SATURATE == _policy)
                     {
                         execute_window_loop(
                             win,
                             [&](const Coordinates &)
                             {
                                 const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
                                 const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
  
                                 int x = window_start_x;
                                 for (; x <= (window_end_x - window_step_x); x += window_step_x)
                                 {
                                     const int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}};
  
                                     vst1q_s8(dst_ptr + x,
                                              vcombine_s8(vqmovn_s16(texels.val[0]), vqmovn_s16(texels.val[1])));
                                 }
  
                                 // Compute left-over elements
                                 for (; x < window_end_x; ++x)
                                 {
                                     *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
                                 }
                             },
                             src, dst);
                     }
                     else
                     {
                         execute_window_loop(
                             win,
                             [&](const Coordinates &)
                             {
                                 const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
                                 const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
  
                                 int x = window_start_x;
                                 for (; x <= (window_end_x - window_step_x); x += window_step_x)
                                 {
                                     const int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}};
  
                                     vst1q_s8(dst_ptr + x,
                                              vcombine_s8(vmovn_s16(texels.val[0]), vmovn_s16(texels.val[1])));
                                 }
  
                                 // Compute left-over elements
                                 for (; x < window_end_x; ++x)
                                 {
                                     *(dst_ptr + x) = static_cast<int8_t>(*(src_ptr + x));
                                 }
                             },
                             src, dst);
                     }
                     break;
                 }
                 case DataType::U8:
                 {
                     /* Down-conversion S16 -> U8 */
                     if (ConvertPolicy::SATURATE == _policy)
                     {
                         execute_window_loop(
                             win,
                             [&](const Coordinates &)
                             {
                                 const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
                                 const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
  
                                 int x = window_start_x;
                                 for (; x <= (window_end_x - window_step_x); x += window_step_x)
                                 {
                                     const int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}};
  
                                     vst1q_u8(dst_ptr + x,
                                              vcombine_u8(vqmovun_s16(texels.val[0]), vqmovun_s16(texels.val[1])));
                                 }
  
                                 // Compute left-over elements
                                 for (; x < window_end_x; ++x)
                                 {
                                     *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
                                 }
                             },
                             src, dst);
                     }
                     else
                     {
                         execute_window_loop(
                             win,
                             [&](const Coordinates &)
                             {
                                 const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
                                 const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
  
                                 int x = window_start_x;
                                 for (; x <= (window_end_x - window_step_x); x += window_step_x)
                                 {
                                     const int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}};
  
                                     vst1q_u8(dst_ptr + x, vcombine_u8(vmovn_u16(vreinterpretq_u16_s16(texels.val[0])),
                                                                       vmovn_u16(vreinterpretq_u16_s16(texels.val[1]))));
                                 }
  
                                 // Compute left-over elements
                                 for (; x < window_end_x; ++x)
                                 {
                                     *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x));
                                 }
                             },
                             src, dst);
                     }
                     break;
                 }
                 case DataType::S32:
                 {
                     /* Up-conversion S16 -> S32 */
                     execute_window_loop(
                         win,
                         [&](const Coordinates &)
                         {
                             const auto src_ptr = reinterpret_cast<const int16_t *>(src.ptr());
                             const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
  
                             int x = window_start_x;
                             for (; x <= (window_end_x - window_step_x); x += window_step_x)
                             {
                                 const int16x8x2_t texels = {{vld1q_s16(src_ptr + x), vld1q_s16(src_ptr + x + 8)}};
  
                                 const int32x4x4_t texels_s32 = {
                                     {vmovl_s16(vget_low_s16(texels.val[0])), vmovl_s16(vget_high_s16(texels.val[0])),
                                      vmovl_s16(vget_low_s16(texels.val[1])), vmovl_s16(vget_high_s16(texels.val[1]))}};
  
                                 vst1q_s32(dst_ptr + x, texels_s32.val[0]);
                                 vst1q_s32(dst_ptr + x + 4, texels_s32.val[1]);
                                 vst1q_s32(dst_ptr + x + 8, texels_s32.val[2]);
                                 vst1q_s32(dst_ptr + x + 12, texels_s32.val[3]);
                             }
  
                             // Compute left-over elements
                             for (; x < window_end_x; ++x)
                             {
                                 *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
                             }
                         },
                         src, dst);
                     break;
                 }
                 default:
                     ARM_COMPUTE_ERROR("dst data type not supported");
             }
             break;
         }
  
         case DataType::U16:
         {
             switch (_dst->info()->data_type())
             {
                 case DataType::U8:
                 {
                     /* Down-conversion U16 -> U8 */
                     if (ConvertPolicy::SATURATE == _policy)
                     {
                         execute_window_loop(
                             win,
                             [&](const Coordinates &)
                             {
                                 const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr());
                                 const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
  
                                 int x = window_start_x;
                                 for (; x <= (window_end_x - window_step_x); x += window_step_x)
                                 {
                                     const uint16x8x2_t texels = {{vld1q_u16(src_ptr + x), vld1q_u16(src_ptr + x + 8)}};
  
                                     vst1q_u8(dst_ptr + x,
                                              vcombine_u8(vqmovn_u16(texels.val[0]), vqmovn_u16(texels.val[1])));
                                 }
  
                                 // Compute left-over elements
                                 for (; x < window_end_x; ++x)
                                 {
                                     *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
                                 }
                             },
                             src, dst);
                     }
                     else
                     {
                         execute_window_loop(
                             win,
                             [&](const Coordinates &)
                             {
                                 const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr());
                                 const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
  
                                 int x = window_start_x;
                                 for (; x <= (window_end_x - window_step_x); x += window_step_x)
                                 {
                                     const uint16x8x2_t texels = {{vld1q_u16(src_ptr + x), vld1q_u16(src_ptr + x + 8)}};
  
                                     vst1q_u8(dst_ptr + x,
                                              vcombine_u8(vmovn_u16(texels.val[0]), vmovn_u16(texels.val[1])));
                                 }
  
                                 // Compute left-over elements
                                 for (; x < window_end_x; ++x)
                                 {
                                     *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x));
                                 }
                             },
                             src, dst);
                     }
                     break;
                 }
                 case DataType::U32:
                 {
                     /* Up-conversion U16 -> U32 */
                     execute_window_loop(
                         win,
                         [&](const Coordinates &)
                         {
                             const auto src_ptr = reinterpret_cast<const uint16_t *>(src.ptr());
                             const auto dst_ptr = reinterpret_cast<uint32_t *>(dst.ptr());
  
                             int x = window_start_x;
                             for (; x <= (window_end_x - window_step_x); x += window_step_x)
                             {
                                 const uint16x8x2_t texels = {{vld1q_u16(src_ptr + x), vld1q_u16(src_ptr + x + 8)}};
  
                                 vst1q_u32(dst_ptr + x, vmovl_u16(vget_low_u16(texels.val[0])));
                                 vst1q_u32(dst_ptr + x + 4, vmovl_u16(vget_high_u16(texels.val[0])));
                                 vst1q_u32(dst_ptr + x + 8, vmovl_u16(vget_low_u16(texels.val[1])));
                                 vst1q_u32(dst_ptr + x + 12, vmovl_u16(vget_high_u16(texels.val[1])));
                             }
                             // Compute left-over elements
                             for (; x < window_end_x; ++x)
                             {
                                 *(dst_ptr + x) = static_cast<uint32_t>(*(src_ptr + x));
                             }
                         },
                         src, dst);
                     break;
                 }
                 default:
                     ARM_COMPUTE_ERROR("dst data type not supported");
             }
             break;
         }
         case DataType::F16:
         {
             /* conversion F16 -> any data type */
             ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
             uk->ukernel(_src, _dst, info, _policy, window);
             break;
         }
         case DataType::F32:
             switch (_dst->info()->data_type())
             {
                 case DataType::F16:
                 {
                     /* Down-conversion F32 -> F16 */
                     ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
                     uk->ukernel(_src, _dst, info, _policy, window);
                     break;
                 }
                 case DataType::S32:
                 {
                     /* Conversion F32 -> S32 */
                     execute_window_loop(
                         win,
                         [&](const Coordinates &)
                         {
                             const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
                             const auto dst_ptr = reinterpret_cast<int32_t *>(dst.ptr());
  
                             int x = window_start_x;
                             for (; x <= (window_end_x - window_step_x); x += window_step_x)
                             {
                                 const float32x4x4_t texels = {{
                                     vld1q_f32(src_ptr + x),
                                     vld1q_f32(src_ptr + x + 4),
                                     vld1q_f32(src_ptr + x + 8),
                                     vld1q_f32(src_ptr + x + 12),
                                 }};
  
                                 vst1q_s32(dst_ptr + x, vcvtq_s32_f32(texels.val[0]));
                                 vst1q_s32(dst_ptr + x + 4, vcvtq_s32_f32(texels.val[1]));
                                 vst1q_s32(dst_ptr + x + 8, vcvtq_s32_f32(texels.val[2]));
                                 vst1q_s32(dst_ptr + x + 12, vcvtq_s32_f32(texels.val[3]));
                             }
  
                             // Compute left-over elements
                             for (; x < window_end_x; ++x)
                             {
                                 *(dst_ptr + x) = static_cast<int32_t>(*(src_ptr + x));
                             }
                         },
                         src, dst);
                     break;
                 }
                 case DataType::QASYMM8:
                 case DataType::U8:
                 {
                     /* Down-conversion F32 -> U8 */
                     execute_window_loop(
                         win,
                         [&](const Coordinates &)
                         {
                             const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
                             const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
  
                             int x = window_start_x;
                             for (; x <= (window_end_x - window_step_x); x += window_step_x)
                             {
                                 const float32x4x4_t texels = {{
                                     vld1q_f32(src_ptr + x),
                                     vld1q_f32(src_ptr + x + 4),
                                     vld1q_f32(src_ptr + x + 8),
                                     vld1q_f32(src_ptr + x + 12),
                                 }};
  
                                 vst1_u8(dst_ptr + x,
                                         vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[0])),
                                                                 vqmovun_s32(vcvtq_s32_f32(texels.val[1])))));
                                 vst1_u8(dst_ptr + x + 8,
                                         vqmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(texels.val[2])),
                                                                 vqmovun_s32(vcvtq_s32_f32(texels.val[3])))));
                             }
  
                             // Compute left-over elements
                             for (; x < window_end_x; ++x)
                             {
                                 *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
                             }
                         },
                         src, dst);
                     break;
                 }
                 case DataType::QASYMM8_SIGNED:
                 {
                     /* Down-conversion F32 -> QASYMM8_SIGNED */
                     execute_window_loop(
                         win,
                         [&](const Coordinates &)
                         {
                             const auto src_ptr = reinterpret_cast<const float *>(src.ptr());
                             const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
  
                             int x = window_start_x;
                             for (; x <= (window_end_x - window_step_x); x += window_step_x)
                             {
                                 const float32x4x4_t texels = {{
                                     vld1q_f32(src_ptr + x),
                                     vld1q_f32(src_ptr + x + 4),
                                     vld1q_f32(src_ptr + x + 8),
                                     vld1q_f32(src_ptr + x + 12),
                                 }};
  
                                 vst1_s8(dst_ptr + x,
                                         vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[0])),
                                                                 vqmovn_s32(vcvtq_s32_f32(texels.val[1])))));
                                 vst1_s8(dst_ptr + x + 8,
                                         vqmovn_s16(vcombine_s16(vqmovn_s32(vcvtq_s32_f32(texels.val[2])),
                                                                 vqmovn_s32(vcvtq_s32_f32(texels.val[3])))));
                             }
                             // Compute left-over elements
                             for (; x < window_end_x; ++x)
                             {
                                 *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
                             }
                         },
                         src, dst);
                     break;
                 }
  
                 default:
                     ARM_COMPUTE_ERROR("dst data type not supported");
             }
             break;
         case DataType::S32:
             switch (_dst->info()->data_type())
             {
 #if __aarch64__
                 case DataType::S64:
                 {
                     convert64<int32_t, int64_t>(src, dst, win, window_start_x, window_end_x, window_step_x);
                     break;
                 }
 #endif // __aarch64__
                 case DataType::F16:
                 {
                     /* Down-conversion S32 -> F16 */
                     ARM_COMPUTE_ERROR_ON(uk->ukernel == nullptr);
                     uk->ukernel(_src, _dst, info, _policy, window);
                     break;
                 }
                 case DataType::F32:
                 {
                     /* Conversion S32 -> F32 */
                     execute_window_loop(
                         win,
                         [&](const Coordinates &)
                         {
                             const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
                             const auto dst_ptr = reinterpret_cast<float *>(dst.ptr());
  
                             int x = window_start_x;
                             for (; x <= (window_end_x - window_step_x); x += window_step_x)
                             {
                                 const int32x4x4_t texels = {{
                                     vld1q_s32(src_ptr + x),
                                     vld1q_s32(src_ptr + x + 4),
                                     vld1q_s32(src_ptr + x + 8),
                                     vld1q_s32(src_ptr + x + 12),
                                 }};
  
                                 vst1q_f32(dst_ptr + x, vcvtq_f32_s32(texels.val[0]));
                                 vst1q_f32(dst_ptr + x + 4, vcvtq_f32_s32(texels.val[1]));
                                 vst1q_f32(dst_ptr + x + 8, vcvtq_f32_s32(texels.val[2]));
                                 vst1q_f32(dst_ptr + x + 12, vcvtq_f32_s32(texels.val[3]));
                             }
  
                             // Compute left-over elements
                             for (; x < window_end_x; ++x)
                             {
                                 *(dst_ptr + x) = static_cast<float>(*(src_ptr + x));
                             }
                         },
                         src, dst);
                     break;
                 }
                 case DataType::QASYMM8_SIGNED:
                 {
                     /* Down-conversion S32 -> QASYMM8_SIGNED */
                     if (ConvertPolicy::SATURATE == _policy)
                     {
                         execute_window_loop(
                             win,
                             [&](const Coordinates &)
                             {
                                 const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
                                 const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
  
                                 int x = window_start_x;
                                 for (; x <= (window_end_x - window_step_x); x += window_step_x)
                                 {
                                     const int32x4x4_t texels = {{
                                         vld1q_s32(src_ptr + x),
                                         vld1q_s32(src_ptr + x + 4),
                                         vld1q_s32(src_ptr + x + 8),
                                         vld1q_s32(src_ptr + x + 12),
                                     }};
                                     vst1_s8(dst_ptr + x, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[0]),
                                                                                  vqmovn_s32(texels.val[1]))));
                                     vst1_s8(dst_ptr + x + 8, vqmovn_s16(vcombine_s16(vqmovn_s32(texels.val[2]),
                                                                                      vqmovn_s32(texels.val[3]))));
                                 }
  
                                 // Compute left-over elements
                                 for (; x < window_end_x; ++x)
                                 {
                                     *(dst_ptr + x) = utils::cast::saturate_cast<int8_t>(*(src_ptr + x));
                                 }
                             },
                             src, dst);
                     }
                     else
                     {
                         execute_window_loop(
                             win,
                             [&](const Coordinates &)
                             {
                                 const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
                                 const auto dst_ptr = reinterpret_cast<int8_t *>(dst.ptr());
  
                                 int x = window_start_x;
                                 for (; x <= (window_end_x - window_step_x); x += window_step_x)
                                 {
                                     const int32x4x4_t texels = {{vld1q_s32(src_ptr + x), vld1q_s32(src_ptr + x + 4),
                                                                  vld1q_s32(src_ptr + x + 8),
                                                                  vld1q_s32(src_ptr + x + 12)}};
  
                                     vst1_s8(dst_ptr + x, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[0]),
                                                                                 vmovn_s32(texels.val[1]))));
                                     vst1_s8(dst_ptr + x + 8, vmovn_s16(vcombine_s16(vmovn_s32(texels.val[2]),
                                                                                     vmovn_s32(texels.val[3]))));
                                 }
  
                                 // Compute left-over elements
                                 for (; x < window_end_x; ++x)
                                 {
                                     *(dst_ptr + x) = static_cast<int8_t>(*(src_ptr + x));
                                 }
                             },
                             src, dst);
                     }
                     break;
                 }
                 case DataType::QASYMM8:
                 case DataType::U8:
                 {
                     /* Down-conversion S32 -> U8 */
                     if (ConvertPolicy::SATURATE == _policy)
                     {
                         execute_window_loop(
                             win,
                             [&](const Coordinates &)
                             {
                                 const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
                                 const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
  
                                 int x = window_start_x;
                                 for (; x <= (window_end_x - window_step_x); x += window_step_x)
                                 {
                                     const int32x4x4_t texels = {{vld1q_s32(src_ptr + x), vld1q_s32(src_ptr + x + 4),
                                                                  vld1q_s32(src_ptr + x + 8),
                                                                  vld1q_s32(src_ptr + x + 12)}};
                                     vst1_u8(dst_ptr + x, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[0]),
                                                                                  vqmovun_s32(texels.val[1]))));
                                     vst1_u8(dst_ptr + x + 8, vqmovn_u16(vcombine_u16(vqmovun_s32(texels.val[2]),
                                                                                      vqmovun_s32(texels.val[3]))));
                                 }
  
                                 // Compute left-over elements
                                 for (; x < window_end_x; ++x)
                                 {
                                     *(dst_ptr + x) = utils::cast::saturate_cast<uint8_t>(*(src_ptr + x));
                                 }
                             },
                             src, dst);
                     }
                     else
                     {
                         execute_window_loop(
                             win,
                             [&](const Coordinates &)
                             {
                                 const auto src_ptr = reinterpret_cast<const int32_t *>(src.ptr());
                                 const auto dst_ptr = reinterpret_cast<uint8_t *>(dst.ptr());
  
                                 int x = window_start_x;
                                 for (; x <= (window_end_x - window_step_x); x += window_step_x)
                                 {
                                     const int32x4x4_t texels = {{vld1q_s32(src_ptr + x), vld1q_s32(src_ptr + x + 4),
                                                                  vld1q_s32(src_ptr + x + 8),
                                                                  vld1q_s32(src_ptr + x + 12)}};
  
                                     vst1_u8(dst_ptr + x,
                                             vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[0])),
                                                                    vmovn_u32(vreinterpretq_u32_s32(texels.val[1])))));
                                     vst1_u8(dst_ptr + x + 8,
                                             vmovn_u16(vcombine_u16(vmovn_u32(vreinterpretq_u32_s32(texels.val[2])),
                                                                    vmovn_u32(vreinterpretq_u32_s32(texels.val[3])))));
                                 }
  
                                 // Compute left-over elements
                                 for (; x < window_end_x; ++x)
                                 {
                                     *(dst_ptr + x) = static_cast<uint8_t>(*(src_ptr + x));
                                 }
                             },
                             src, dst);
                     }
                     break;
                 }
                 default:
                     ARM_COMPUTE_ERROR("dst data type not supported");
             }
             break;
         default:
             ARM_COMPUTE_ERROR("Not supported");
     }
 }

References arm_compute::ACL_DST, arm_compute::ACL_SRC, ARM_COMPUTE_ERROR, ARM_COMPUTE_ERROR_ON, ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW, ARM_COMPUTE_ERROR_ON_NULLPTR, ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL, ARM_COMPUTE_UNUSED, arm_compute::test::validation::data_type, ITensorInfo::data_type(), Window::DimX, arm_compute::test::validation::dst, Window::Dimension::end(), arm_compute::execute_window_loop(), arm_compute::F16, arm_compute::F32, CPUInfo::get(), ITensorPack::get_const_tensor(), ICpuKernel< CpuCastKernel >::get_implementation(), CPUInfo::get_isa(), ITensorPack::get_tensor(), ITensor::info(), arm_compute::test::validation::info, arm_compute::QASYMM8, arm_compute::QASYMM8_SIGNED, arm_compute::S16, arm_compute::S32, arm_compute::S64, arm_compute::SATURATE, Window::set(), arm_compute::test::validation::src, Window::Dimension::start(), arm_compute::U16, arm_compute::U32, arm_compute::U64, arm_compute::U8, IKernel::window(), and Window::x().

◆ validate()

Status validate	(	const ITensorInfo *	src,
		const ITensorInfo *	dst,
		ConvertPolicy	policy
	)

static

Static function to check if given info will lead to a valid configuration.

Similar to CpuCastKernel::configure()

Returns: a status

Definition at line 180 of file CpuCastKernel.cpp.

 {
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, policy));
     return Status{};
 }

References ARM_COMPUTE_RETURN_ON_ERROR, arm_compute::test::validation::dst, arm_compute::test::validation::src, and arm_compute::cpu::kernels::validate_arguments().

Referenced by CpuCast::validate().

The documentation for this class was generated from the following files:

src/cpu/kernels/CpuCastKernel.h
src/cpu/kernels/CpuCastKernel.cpp

Data Structures

Public Member Functions

Static Public Member Functions

Additional Inherited Members