Compute Library
 21.02
NELogicalKernel.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2020 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
25 
31 
32 #include <arm_neon.h>
33 
34 namespace arm_compute
35 {
36 namespace kernels
37 {
38 namespace
39 {
40 static const uint8x8_t c0_x8 = vdup_n_u8(0);
41 static const uint8x16_t c0_x16 = vdupq_n_u8(0);
42 static const uint8x8_t c1_x8 = vdup_n_u8(1);
43 static const uint8x16_t c1_x16 = vdupq_n_u8(1);
44 static const int step = 16;
45 static const int half_step = step / 2;
46 
47 void neon_logical_and(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, int len)
48 {
52  ARM_COMPUTE_ASSERT(len >= 0);
53 
54  for(; len >= step; len -= step)
55  {
56  vst1q_u8(dst, vandq_u8(vminq_u8(vld1q_u8(src0), c1_x16), vminq_u8(vld1q_u8(src1), c1_x16)));
57  src0 += step;
58  src1 += step;
59  dst += step;
60  }
61 
62  for(; len >= half_step; len -= half_step)
63  {
64  vst1_u8(dst, vand_u8(vmin_u8(vld1_u8(src0), c1_x8), vmin_u8(vld1_u8(src1), c1_x8)));
65  src0 += half_step;
66  src1 += half_step;
67  dst += half_step;
68  }
69 
70  for(; len > 0; --len)
71  {
72  *dst = (*src0) && (*src1);
73  ++src0;
74  ++src1;
75  ++dst;
76  }
77 }
78 
79 void neon_logical_and_broadcast(const uint8_t *src, uint8_t broadcast_val, uint8_t *dst, int len)
80 {
83  ARM_COMPUTE_ASSERT(len >= 0);
84 
85  const auto broadcast_val_clamped_s = std::min<uint8_t>(broadcast_val, 1);
86  const auto broadcast_val_clamped_x16 = vdupq_n_u8(broadcast_val_clamped_s);
87  const auto broadcast_val_clamped_x8 = vdup_n_u8(broadcast_val_clamped_s);
88 
89  for(; len >= step; len -= step)
90  {
91  vst1q_u8(dst, vandq_u8(vminq_u8(vld1q_u8(src), c1_x16), broadcast_val_clamped_x16));
92  src += step;
93  dst += step;
94  }
95 
96  for(; len >= half_step; len -= half_step)
97  {
98  vst1_u8(dst, vand_u8(vmin_u8(vld1_u8(src), c1_x8), broadcast_val_clamped_x8));
99  src += half_step;
100  dst += half_step;
101  }
102 
103  for(; len > 0; --len)
104  {
105  *dst = (*src) && broadcast_val_clamped_s;
106  ++src;
107  ++dst;
108  }
109 }
110 
111 void neon_logical_or(const uint8_t *src0, const uint8_t *src1, uint8_t *dst, int len)
112 {
116  ARM_COMPUTE_ASSERT(len >= 0);
117 
118  for(; len >= step; len -= step)
119  {
120  vst1q_u8(dst, vorrq_u8(vminq_u8(vld1q_u8(src0), c1_x16), vminq_u8(vld1q_u8(src1), c1_x16)));
121  src0 += step;
122  src1 += step;
123  dst += step;
124  }
125 
126  for(; len >= half_step; len -= half_step)
127  {
128  vst1_u8(dst, vorr_u8(vmin_u8(vld1_u8(src0), c1_x8), vmin_u8(vld1_u8(src1), c1_x8)));
129  src0 += half_step;
130  src1 += half_step;
131  dst += half_step;
132  }
133 
134  for(; len > 0; --len)
135  {
136  *dst = (*src0) || (*src1);
137  ++src0;
138  ++src1;
139  ++dst;
140  }
141 }
142 
143 void neon_logical_or_broadcast(const uint8_t *src, uint8_t broadcast_val, uint8_t *dst, int len)
144 {
147  ARM_COMPUTE_ASSERT(len >= 0);
148 
149  const auto broadcast_val_clamped_s = std::min<uint8_t>(broadcast_val, 1);
150  const auto broadcast_val_clamped_x16 = vdupq_n_u8(broadcast_val_clamped_s);
151  const auto broadcast_val_clamped_x8 = vdup_n_u8(broadcast_val_clamped_s);
152 
153  for(; len >= step; len -= step)
154  {
155  vst1q_u8(dst, vorrq_u8(vminq_u8(vld1q_u8(src), c1_x16), broadcast_val_clamped_x16));
156  src += step;
157  dst += step;
158  }
159 
160  for(; len >= half_step; len -= half_step)
161  {
162  vst1_u8(dst, vorr_u8(vmin_u8(vld1_u8(src), c1_x8), broadcast_val_clamped_x8));
163  src += half_step;
164  dst += half_step;
165  }
166 
167  for(; len > 0; --len)
168  {
169  *dst = (*src) || broadcast_val_clamped_s;
170  ++src;
171  ++dst;
172  }
173 }
174 
175 void neon_logical_not(const uint8_t *src, uint8_t *dst, int len)
176 {
179  ARM_COMPUTE_ASSERT(len >= 0);
180 
181  for(; len >= step; len -= step)
182  {
183  vst1q_u8(dst, vbslq_u8(vceqq_u8(vld1q_u8(src), c0_x16), c1_x16, c0_x16));
184  src += step;
185  dst += step;
186  }
187 
188  for(; len >= half_step; len -= half_step)
189  {
190  vst1_u8(dst, vbsl_u8(vceq_u8(vld1_u8(src), c0_x8), c1_x8, c0_x8));
191  src += half_step;
192  dst += half_step;
193  }
194 
195  for(; len > 0; --len)
196  {
197  *dst = !(*src);
198  ++src;
199  ++dst;
200  }
201 }
202 
203 void run_unary(const Window &window, const ITensor *src, ITensor *dst)
204 {
205  Window win{ window };
206  win.set(Window::DimX, Window::Dimension(0, 1, 1));
207  const auto len = static_cast<int>(window.x().end()) - static_cast<int>(window.x().start());
208 
209  Iterator in(src, win);
210  Iterator out(dst, win);
211 
212  execute_window_loop(win, [&](const Coordinates &)
213  {
214  neon_logical_not(in.ptr(), out.ptr(), len);
215  },
216  in, out);
217 }
218 
219 void run_binary(const Window &window, const ITensor *src0, const ITensor *src1, ITensor *dst, LogicalOperation op)
220 {
221  Window src0_win = window.broadcast_if_dimension_le_one(src0->info()->tensor_shape());
222  Window src1_win = window.broadcast_if_dimension_le_one(src1->info()->tensor_shape());
223 
224  Window win{ window };
225  win.set(Window::DimX, Window::Dimension(0, 1, 1));
226 
227  const bool is_broadcast_across_x = src0->info()->tensor_shape().x() != src1->info()->tensor_shape().x();
228  const auto len = static_cast<int>(window.x().end()) - static_cast<int>(window.x().start());
229 
230  if(is_broadcast_across_x)
231  {
233  LogicalBroadcastUKernelPtr logical_func = op == LogicalOperation::Or ? &neon_logical_or_broadcast : &neon_logical_and_broadcast;
234 
235  const bool is_broadcast_input_1 = src1_win.x().step() == 0;
236  Window broadcast_win = is_broadcast_input_1 ? src1_win : src0_win;
237  Window non_broadcast_win = !is_broadcast_input_1 ? src1_win : src0_win;
238  const ITensor *broadcast_tensor = is_broadcast_input_1 ? src1 : src0;
239  const ITensor *non_broadcast_tensor = !is_broadcast_input_1 ? src1 : src0;
240  non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
241 
242  Iterator broadcast_in(broadcast_tensor, broadcast_win);
243  Iterator non_broadcast_in(non_broadcast_tensor, non_broadcast_win);
244  Iterator out(dst, win);
245 
246  execute_window_loop(win, [&](const Coordinates &)
247  {
248  const uint8_t broadcast_value = *broadcast_in.ptr();
249  logical_func(non_broadcast_in.ptr(), broadcast_value, out.ptr(), len);
250 
251  },
252  broadcast_in, non_broadcast_in, out);
253  }
254  else
255  {
257  LogicalUKernelPtr logical_func = op == LogicalOperation::Or ? &neon_logical_or : &neon_logical_and;
258 
259  src0_win.set(Window::DimX, Window::Dimension(0, 1, 1));
260  src1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
261 
262  Iterator in0(src0, src0_win);
263  Iterator in1(src1, src1_win);
264  Iterator out(dst, win);
265  execute_window_loop(win, [&](const Coordinates &)
266  {
267  logical_func(in0.ptr(), in1.ptr(), out.ptr(), len);
268  },
269  in0, in1, out);
270  }
271 }
272 } // namespace
273 const char *NELogicalKernel::name() const
274 {
275  return "NELogicalKernel";
276 }
277 
278 void NELogicalKernel::configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, LogicalOperation op)
279 {
280  ARM_COMPUTE_ERROR_ON_NULLPTR(input1, output);
281  ARM_COMPUTE_ERROR_THROW_ON(validate(input1, input2, output, op));
282 
283  _op = op;
284 
285  Window win = calculate_max_window(*input1, Steps());
286  TensorShape out_shape = input1->tensor_shape();
287  if(op != LogicalOperation::Not)
288  {
290  const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(*input1, *input2);
291  out_shape = broadcast_pair.first;
292  win = calculate_max_window(broadcast_pair.second, Steps());
293  }
294  ICPPKernel::configure(win);
295 
296  // Auto initialize if empty
297  set_shape_if_empty(*output, out_shape);
298  set_data_type_if_unknown(*output, input1->data_type());
299 }
300 
/** Static function to check if given info will lead to a valid configuration of NELogicalKernel.
 *
 * In the code visible here, a non-unary operation computes the broadcast shape of the two
 * inputs and returns an error when that shape has a total size of 0; otherwise a default
 * Status is returned.
 *
 * NOTE(review): the source listing skips lines 303-304, 311 and 317-318, so further checks
 * (including the body of the output branch below) are not visible in this view - confirm
 * against the upstream file.
 * NOTE(review): input1 is dereferenced unconditionally and input2 for every operation other
 * than LogicalOperation::Not; no null check is visible in this listing.
 *
 * @param[in] input1 First input tensor info.
 * @param[in] input2 Second input tensor info; read only when @p op is not LogicalOperation::Not.
 * @param[in] output Output tensor info; may be nullptr.
 * @param[in] op     Logical operation to perform.
 *
 * @return A Status describing the outcome of the checks.
 */
301 Status NELogicalKernel::validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, LogicalOperation op)
302 {
305 
// Unary default: the output shape matches the first input.
306  TensorShape out_shape = input1->tensor_shape();
307  if(op != LogicalOperation::Not)
308  {
// A broadcast shape with total size 0 is reported as incompatible inputs.
309  out_shape = TensorShape::broadcast_shape(input1->tensor_shape(), input2->tensor_shape());
310  ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible");
312  }
313 
314  // Checks performed when output is configured
315  if((output != nullptr) && (output->total_size() != 0))
316  {
// NOTE(review): the statements of this branch (listing lines 317-318) are not shown here.
319  }
320 
321  return Status{};
322 }
323 
// Body of the kernel's run entry point: fetches the source tensors from the tensor pack
// and dispatches to run_unary() for LogicalOperation::Not or to run_binary() otherwise.
// NOTE(review): the source listing omits line 324 (the function signature), lines 327-328
// and line 333 (where `dst` is presumably obtained from the pack) - confirm against the
// upstream file.
325 {
// `info` is not used by this kernel.
326  ARM_COMPUTE_UNUSED(info);
329  ARM_COMPUTE_ERROR_ON(tensors.empty());
330 
331  const ITensor *src0 = tensors.get_const_tensor(TensorType::ACL_SRC_0);
332  const ITensor *src1 = tensors.get_const_tensor(TensorType::ACL_SRC_1);
334 
// The operation stored in _op selects the unary or the binary path.
335  if(_op == LogicalOperation::Not)
336  {
337  run_unary(window, src0, dst);
338  }
339  else
340  {
341  run_binary(window, src0, src1, dst, _op);
342  }
343 }
344 } // namespace kernels
345 } // namespace arm_compute
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
const Window & window() const
The maximum window the kernel can be executed on.
Definition: IKernel.cpp:28
Shape of a tensor.
Definition: TensorShape.h:39
bool set_data_type_if_unknown(ITensorInfo &info, DataType data_type)
Set the data type and number of channels to the specified value if the current data type is unknown...
bool empty() const
Checks if pack is empty.
Definition: ITensorPack.cpp:61
constexpr int step() const
Return the step of the dimension.
Definition: Window.h:104
1 channel, 1 U8 per channel
virtual DataType data_type() const =0
Data type used for each element of the tensor.
static TensorShape broadcast_shape(const Shapes &... shapes)
If shapes are broadcast compatible, return the broadcasted shape.
Definition: TensorShape.h:211
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
Definition: Error.h:466
Store the tensor's metadata.
Definition: ITensorInfo.h:40
#define ARM_COMPUTE_ERROR_THROW_ON(status)
Definition: Error.h:455
Describe one of the image's dimensions with a start, end and step.
Definition: Window.h:77
Status class.
Definition: Error.h:52
#define ARM_COMPUTE_RETURN_ERROR_ON(cond)
If the condition is true, an error is returned.
Definition: Error.h:296
static std::pair< TensorShape, ValidRegion > broadcast_shape_and_valid_region(const Infos &... infos)
If infos are broadcast compatible tensor info's, return the broadcasted shape and the intersection of...
Definition: ITensorInfo.h:271
decltype(strategy::transforms) typedef type
Interface for Neon tensor.
Definition: ITensor.h:36
SimpleTensor< float > src
Definition: DFT.cpp:155
Copyright (c) 2017-2021 Arm Limited.
static Status validate(const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, LogicalOperation op)
Static function to check if given info will lead to a valid configuration of NELogicalKernel.
T x() const
Alias to access the size of the first dimension.
Definition: Dimensions.h:87
const ITensor * get_const_tensor(int id) const
Get constant tensor of a given id.
Definition: ITensorPack.cpp:40
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
Definition: Window.h:43
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
Definition: Error.h:152
virtual const TensorShape & tensor_shape() const =0
Size for each dimension of the tensor.
Class to describe a number of elements in each dimension.
Definition: Steps.h:40
Coordinates of an item.
Definition: Coordinates.h:37
size_t total_size() const
Collapses all dimensions to a single linear total size.
Definition: TensorShape.h:172
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
bool set_shape_if_empty(ITensorInfo &info, const TensorShape &shape)
Set the shape to the specified value if the current assignment is empty.
constexpr uint8_t * ptr() const
Return a pointer to the current pixel.
Definition: Helpers.inl:139
bool have_different_dimensions(const Dimensions< T > &dim1, const Dimensions< T > &dim2, unsigned int upper_dim)
Definition: Validate.h:51
void set(size_t dimension, const Dimension &dim)
Set the values of a given dimension.
Definition: Window.inl:49
#define ARM_COMPUTE_ASSERT_NOT_NULLPTR(ptr)
Definition: Validate.h:38
Window broadcast_if_dimension_le_one(const TensorShape &shape) const
Don't advance in the dimension where shape is less than or equal to 1.
Definition: Window.inl:120
const char * name() const override
Name of the kernel.
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
Definition: Validate.h:941
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
ITensor * get_tensor(int id)
Get tensor of a given id from the pack.
Definition: ITensorPack.cpp:50
Information about executing thread and CPU.
Definition: CPPTypes.h:235
virtual size_t total_size() const =0
Returns the total size of the tensor in bytes.
constexpr int step
Definition: fp32.cpp:35
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(...)
Definition: Validate.h:545
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
Definition: Validate.h:792
#define ARM_COMPUTE_RETURN_ERROR_ON_MSG(cond, msg)
If the condition is true, an error is returned.
Definition: Error.h:244
Tensor packing service.
Definition: ITensorPack.h:37
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
Definition: Validate.h:161
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_funct...
Definition: Helpers.inl:77
#define ARM_COMPUTE_ASSERT(cond)
Definition: Validate.h:37
constexpr int end() const
Return the end of the dimension.
Definition: Window.h:99
Iterator updated by execute_window_loop for each window element.
Definition: Helpers.h:46
void configure(const ITensorInfo *input1, const ITensorInfo *input2, ITensorInfo *output, LogicalOperation op)
Initialise the kernel&#39;s inputs and output.
LogicalOperation
List of supported logical operations.
Definition: KernelTypes.h:30
constexpr int start() const
Return the start of the dimension.
Definition: Window.h:94
Describe a multidimensional execution window.
Definition: Window.h:39
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)
Definition: Validate.h:205
constexpr const Dimension & x() const
Alias to access the first dimension of the window.
Definition: Window.h:145