static const uint8x8_t  c0_x8     = vdup_n_u8(0);
static const uint8x16_t c0_x16    = vdupq_n_u8(0);
static const uint8x8_t  c1_x8     = vdup_n_u8(1);
static const uint8x16_t c1_x16    = vdupq_n_u8(1);
static const uint32_t   step      = 16;
static const uint32_t   half_step = step / 2;
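
// All "logical" micro-kernels below first normalise each byte with vmin(x, 1), which
// maps any non-zero value to exactly 1. A plain bitwise AND/OR on the normalised lanes
// is then a true logical AND/OR (without the clamp, e.g. 2 AND 4 would give 0 bitwise).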
void neon_logical_and(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, uint32_t len)
{
    // 16 bytes per iteration using the full 128-bit registers
    for(; len >= step; len -= step)
    {
        vst1q_u8(dst, vandq_u8(vminq_u8(vld1q_u8(src0), c1_x16), vminq_u8(vld1q_u8(src1), c1_x16)));
        src0 += step; src1 += step; dst += step;
    }
    // 8-byte tail using the 64-bit registers
    for(; len >= half_step; len -= half_step)
    {
        vst1_u8(dst, vand_u8(vmin_u8(vld1_u8(src0), c1_x8), vmin_u8(vld1_u8(src1), c1_x8)));
        src0 += half_step; src1 += half_step; dst += half_step;
    }
    // scalar leftovers
    for(; len > 0; --len)
    {
        *dst = (*src0) && (*src1);
        ++src0; ++src1; ++dst;
    }
}
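
// Illustrative only (not in the original source): the micro-kernels operate on raw,
// contiguous byte buffers, with `len` counted in elements. A hypothetical call:
//
//   uint8_t a[5]   = { 0, 2, 1, 0, 7 };
//   uint8_t b[5]   = { 1, 4, 0, 0, 1 };
//   uint8_t out[5] = {};
//   neon_logical_and(a, b, out, 5);   // out == { 0, 1, 0, 0, 1 }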
void neon_logical_and_broadcast(const uint8_t *src, uint8_t broadcast_val, uint8_t *dst, uint32_t len)
{
    // Clamp the broadcast scalar to 0/1 once, then splat it across both register widths
    const auto broadcast_val_clamped_s   = std::min<uint8_t>(broadcast_val, 1);
    const auto broadcast_val_clamped_x16 = vdupq_n_u8(broadcast_val_clamped_s);
    const auto broadcast_val_clamped_x8  = vdup_n_u8(broadcast_val_clamped_s);

    for(; len >= step; len -= step)
    {
        vst1q_u8(dst, vandq_u8(vminq_u8(vld1q_u8(src), c1_x16), broadcast_val_clamped_x16));
        src += step; dst += step;
    }
    for(; len >= half_step; len -= half_step)
    {
        vst1_u8(dst, vand_u8(vmin_u8(vld1_u8(src), c1_x8), broadcast_val_clamped_x8));
        src += half_step; dst += half_step;
    }
    for(; len > 0; --len)
    {
        *dst = (*src) && broadcast_val_clamped_s;
        ++src; ++dst;
    }
}
void neon_logical_or(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, uint32_t len)
{
    for(; len >= step; len -= step)
    {
        vst1q_u8(dst, vorrq_u8(vminq_u8(vld1q_u8(src0), c1_x16), vminq_u8(vld1q_u8(src1), c1_x16)));
        src0 += step; src1 += step; dst += step;
    }
    for(; len >= half_step; len -= half_step)
    {
        vst1_u8(dst, vorr_u8(vmin_u8(vld1_u8(src0), c1_x8), vmin_u8(vld1_u8(src1), c1_x8)));
        src0 += half_step; src1 += half_step; dst += half_step;
    }
    for(; len > 0; --len)
    {
        *dst = (*src0) || (*src1);
        ++src0; ++src1; ++dst;
    }
}
void neon_logical_or_broadcast(const uint8_t *src, uint8_t broadcast_val, uint8_t *dst, uint32_t len)
{
    const auto broadcast_val_clamped_s   = std::min<uint8_t>(broadcast_val, 1);
    const auto broadcast_val_clamped_x16 = vdupq_n_u8(broadcast_val_clamped_s);
    const auto broadcast_val_clamped_x8  = vdup_n_u8(broadcast_val_clamped_s);

    for(; len >= step; len -= step)
    {
        vst1q_u8(dst, vorrq_u8(vminq_u8(vld1q_u8(src), c1_x16), broadcast_val_clamped_x16));
        src += step; dst += step;
    }
    for(; len >= half_step; len -= half_step)
    {
        vst1_u8(dst, vorr_u8(vmin_u8(vld1_u8(src), c1_x8), broadcast_val_clamped_x8));
        src += half_step; dst += half_step;
    }
    for(; len > 0; --len)
    {
        *dst = (*src) || broadcast_val_clamped_s;
        ++src; ++dst;
    }
}
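
// Logical NOT uses a different trick: vceq compares each byte against zero, producing an
// all-ones lane mask where the input is 0, and vbsl uses that mask to select 1 for those
// lanes and 0 everywhere else.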
void neon_logical_not(const uint8_t *src, uint8_t *dst, uint32_t len)
{
    for(; len >= step; len -= step)
    {
        vst1q_u8(dst, vbslq_u8(vceqq_u8(vld1q_u8(src), c0_x16), c1_x16, c0_x16));
        src += step; dst += step;
    }
    for(; len >= half_step; len -= half_step)
    {
        vst1_u8(dst, vbsl_u8(vceq_u8(vld1_u8(src), c0_x8), c1_x8, c0_x8));
        src += half_step; dst += half_step;
    }
    for(; len > 0; --len)
    {
        *dst = !(*src);
        ++src; ++dst;
    }
}
void run_unary(const Window &window, const ITensor *src, ITensor *dst)
{
    // Collapse the X dimension so each iteration hands a full row to the micro-kernel
    Window win{ window };
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    const auto len = window.x().end() - window.x().start();

    Iterator in(src, win);
    Iterator out(dst, win);

    execute_window_loop(win, [&](const Coordinates &)
    {
        neon_logical_not(in.ptr(), out.ptr(), len);
    },
    in, out);
}
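
// run_binary below uses the same DimX-collapse idiom: the window's X extent becomes the
// `len` argument, and each execute_window_loop iteration sweeps one contiguous row.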
using LogicalUKernelPtr          = void (*)(const uint8_t *, const uint8_t *, uint8_t *, uint32_t);
using LogicalBroadcastUKernelPtr = void (*)(const uint8_t *, uint8_t, uint8_t *, uint32_t);

void run_binary(const Window &window, const ITensor *src0, const ITensor *src1, ITensor *dst, LogicalOperation op)
{
    Window src0_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
    Window src1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());

    Window win{ window };
    win.set(Window::DimX, Window::Dimension(0, 1, 1));

    const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
    const auto len                   = window.x().end() - window.x().start();

    if(is_broadcast_across_x)
    {
        LogicalBroadcastUKernelPtr logical_func = op == LogicalOperation::Or ? &neon_logical_or_broadcast : &neon_logical_and_broadcast;

        // The broadcast input is the one whose window does not advance along X
        const bool     is_broadcast_input_1 = src1_win.x().step() == 0;
        Window         broadcast_win        = is_broadcast_input_1 ? src1_win : src0_win;
        Window         non_broadcast_win    = !is_broadcast_input_1 ? src1_win : src0_win;
        const ITensor *broadcast_tensor     = is_broadcast_input_1 ? src1 : src0;
        const ITensor *non_broadcast_tensor = !is_broadcast_input_1 ? src1 : src0;
        non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator broadcast_in(broadcast_tensor, broadcast_win);
        Iterator non_broadcast_in(non_broadcast_tensor, non_broadcast_win);
        Iterator out(dst, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            const uint8_t broadcast_value = *broadcast_in.ptr();
            logical_func(non_broadcast_in.ptr(), broadcast_value, out.ptr(), len);
        },
        broadcast_in, non_broadcast_in, out);
    }
    else
    {
        LogicalUKernelPtr logical_func = op == LogicalOperation::Or ? &neon_logical_or : &neon_logical_and;

        src0_win.set(Window::DimX, Window::Dimension(0, 1, 1));
        src1_win.set(Window::DimX, Window::Dimension(0, 1, 1));

        Iterator in0(src0, src0_win);
        Iterator in1(src1, src1_win);
        Iterator out(dst, win);

        execute_window_loop(win, [&](const Coordinates &)
        {
            logical_func(in0.ptr(), in1.ptr(), out.ptr(), len);
        },
        in0, in1, out);
    }
}
const char *NELogicalKernel::name() const
{
    return "NELogicalKernel";
}
void NELogicalKernel::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, LogicalOperation op)
{
    // ... (validation, output auto-initialisation, and window calculation elided) ...
    ICPPKernel::configure(win);
}
Status NELogicalKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, LogicalOperation op)
{
    // ... (input data-type and shape checks elided) ...
    if((output != nullptr) && (output->total_size() != 0))
    {
        // ... (output checks elided) ...
    }
    return Status{};
}
void NELogicalKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);

    const ITensor *src0 = tensors.get_const_tensor(TensorType::ACL_SRC_0);
    const ITensor *src1 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
    ITensor       *dst  = tensors.get_tensor(TensorType::ACL_DST);

    if(_op == LogicalOperation::Not)
    {
        run_unary(window, src0, dst);
    }
    else
    {
        run_binary(window, src0, src1, dst, _op);
    }
}
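
// A minimal usage sketch (illustrative, not part of this file), assuming three allocated
// U8 tensors; names are hypothetical and error handling is omitted:
//
//   NELogicalKernel kernel;
//   ARM_COMPUTE_ERROR_THROW_ON(NELogicalKernel::validate(in0.info(), in1.info(), out.info(), LogicalOperation::And));
//   kernel.configure(in0.info(), in1.info(), out.info(), LogicalOperation::And);
//
//   ITensorPack pack;
//   pack.add_tensor(TensorType::ACL_SRC_0, &in0);
//   pack.add_tensor(TensorType::ACL_SRC_1, &in1);
//   pack.add_tensor(TensorType::ACL_DST, &out);
//   kernel.run_op(pack, kernel.window(), ThreadInfo{});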