Compute Library
 21.02
NEScharr3x3Kernel.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2016-2020 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
25 
27 #include "arm_compute/core/Error.h"
30 #include "arm_compute/core/Types.h"
34 
35 #include <arm_neon.h>
36 #include <cstdint>
37 
38 using namespace arm_compute;
39 
40 namespace
41 {
// Scharr filter weights, each broadcast to all eight signed 16-bit lanes of a
// NEON vector. scharr_x() and scharr_y() use them as the multiplier operand of
// vmulq_s16 / vmlaq_s16.
42 const int16x8_t three = vdupq_n_s16(3);
43 const int16x8_t minus_three = vdupq_n_s16(-3);
44 const int16x8_t ten = vdupq_n_s16(10);
45 const int16x8_t minus_ten = vdupq_n_s16(-10);
46 
47 inline int16x8_t scharr_y(const int16x8x2_t &top, const int16x8x2_t &bottom)
48 {
49  // Top left
50  int16x8_t out = vmulq_s16(top.val[0], minus_three);
51  // Top center
52  out = vmlaq_s16(out, vextq_s16(top.val[0], top.val[1], 1), minus_ten);
53  // Top right
54  out = vmlaq_s16(out, vextq_s16(top.val[0], top.val[1], 2), minus_three);
55 
56  // Bottom left
57  out = vmlaq_s16(out, bottom.val[0], three);
58  // Bottom center
59  out = vmlaq_s16(out, vextq_s16(bottom.val[0], bottom.val[1], 1), ten);
60  // Bottom right
61  out = vmlaq_s16(out, vextq_s16(bottom.val[0], bottom.val[1], 2), three);
62 
63  return out;
64 }
65 
66 inline int16x8_t scharr_x(const int16x8x2_t &top, const int16x8x2_t &middle, const int16x8x2_t &bottom)
67 {
68  // Top left
69  int16x8_t out = vmulq_s16(top.val[0], minus_three);
70  // Top right
71  out = vmlaq_s16(out, vextq_s16(top.val[0], top.val[1], 2), three);
72 
73  // Middle left
74  out = vmlaq_s16(out, middle.val[0], minus_ten);
75  // Middle right
76  out = vmlaq_s16(out, vextq_s16(middle.val[0], middle.val[1], 2), ten);
77 
78  // Bottom left
79  out = vmlaq_s16(out, bottom.val[0], minus_three);
80  // Bottom right
81  out = vmlaq_s16(out, vextq_s16(bottom.val[0], bottom.val[1], 2), three);
82 
83  return out;
84 }
85 } // namespace
86 
// Member-initialiser list and empty body of a constructor: both run flags start
// as false and the input / output tensor pointers start as nullptr.
// NOTE(review): the line carrying the constructor's name is absent from this
// listing (the embedded numbering jumps from 86 to 88); presumably it is
// `NEScharr3x3Kernel::NEScharr3x3Kernel()` - confirm against the original file.
88  : _run_scharr_x(false), _run_scharr_y(false), _input(nullptr), _output_x(nullptr), _output_y(nullptr)
89 {
90 }
91 
// Set up the kernel: record which gradients are requested, store the tensors,
// and derive the execution window together with the valid region of each output.
//
// input            Source tensor; its info() drives the window computation.
// output_x         Destination for the horizontal gradient, or nullptr to skip it.
// output_y         Destination for the vertical gradient, or nullptr to skip it.
// border_undefined Forwarded to calculate_max_window() and set_valid_region().
//
// NOTE(review): this listing has lost several lines (embedded numbers 94, 102,
// 107 and 124 are absent). The gaps are marked below; confirm against the
// original file before relying on this text.
92 void NEScharr3x3Kernel::configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined)
93 {
// NOTE(review): embedded line 94 is missing here - presumably an input validation check.
// At least one of the two outputs has to be provided.
95  ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr));
96 
// A gradient is computed only when its output tensor was supplied.
97  _run_scharr_x = output_x != nullptr;
98  _run_scharr_y = output_y != nullptr;
99 
// NOTE(review): both conditional bodies below are empty in this listing
// (embedded lines 102 and 107 are missing) - presumably per-output validation.
100  if(_run_scharr_x)
101  {
103  }
104 
105  if(_run_scharr_y)
106  {
108  }
109 
110  _input = input;
111  _output_x = output_x;
112  _output_y = output_y;
113 
114  // Configure kernel window
// Each iteration produces 8 outputs but loads 16 input elements from each of 3 rows.
115  constexpr unsigned int num_elems_processed_per_iteration = 8;
116  constexpr unsigned int num_elems_read_per_iteration = 16;
117  constexpr unsigned int num_elems_written_per_iteration = 8;
118  constexpr unsigned int num_rows_read_per_iteration = 3;
119 
120  Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
// An absent output gets an access pattern built on a null tensor info.
121  AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration);
122  AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration);
123 
// NOTE(review): embedded line 124 is missing; the three lines below are the tail
// of a call's argument list - presumably update_window_and_padding(win, ...).
// The input access rectangle starts one border to the left of and above the
// current position.
125  AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration),
126  output_x_access,
127  output_y_access);
128 
129  output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
130  output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
131 
// Hand the final window to the base kernel.
132  INEKernel::configure(win);
133 }
134 
// Function body returning BorderSize(1).
// NOTE(review): the signature line is absent from this listing (embedded line
// 135 is missing); presumably `BorderSize NEScharr3x3Kernel::border_size() const`
// - confirm against the original file.
136 {
137  return BorderSize(1);
138 }
139 
// Body of the kernel's execution routine: walks `window` and writes the Scharr
// responses for eight pixels per iteration into whichever outputs were configured.
// NOTE(review): the signature line and two statements are absent from this
// listing (embedded lines 140, 143 and 144 are missing); presumably the
// signature is `void NEScharr3x3Kernel::run(const Window &window, const ThreadInfo &info)`
// and the missing statements are argument checks - confirm against the original file.
141 {
// `info` is not used by this kernel.
142  ARM_COMPUTE_UNUSED(info);
145 
// Base pointers to the input elements at (-1, -1), (-1, 0) and (-1, +1). Going by
// the variable names these are the rows above, at and below the origin, each
// starting one element to its left. The iterator's byte offset is added to them
// on every iteration.
146  const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-1, -1));
147  const unsigned char *const input_mid_ptr = _input->ptr_to_element(Coordinates(-1, 0));
148  const unsigned char *const input_bot_ptr = _input->ptr_to_element(Coordinates(-1, +1));
149 
150  Iterator input(_input, window);
151  Iterator output_y;
152  Iterator output_x;
153 
// An output iterator is bound to a tensor only when that gradient is requested.
154  if(_run_scharr_y)
155  {
156  output_y = Iterator(_output_y, window);
157  }
158 
159  if(_run_scharr_x)
160  {
161  output_x = Iterator(_output_x, window);
162  }
163 
// The three branches below differ only in which gradients are computed and stored.
// Case 1: both gradients.
164  if(_run_scharr_x && _run_scharr_y)
165  {
166  execute_window_loop(window, [&](const Coordinates &)
167  {
168 
// Load 16 bytes from each of the three rows.
169  const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
170  const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
171  const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
172 
// Widen each row from unsigned 8-bit to 16-bit lanes (low half, then high half)
// and reinterpret them as signed for the filter arithmetic.
173  const int16x8x2_t top_s16 =
174  {
175  {
176  vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(top_data))),
177  vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(top_data)))
178  }
179  };
180  const int16x8x2_t mid_s16 =
181  {
182  {
183  vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mid_data))),
184  vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mid_data)))
185  }
186  };
187  const int16x8x2_t bot_s16 =
188  {
189  {
190  vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bot_data))),
191  vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bot_data)))
192  }
193  };
194 
// Store eight 16-bit results to each output.
195  vst1q_s16(reinterpret_cast<int16_t *>(output_x.ptr()), scharr_x(top_s16, mid_s16, bot_s16));
196  vst1q_s16(reinterpret_cast<int16_t *>(output_y.ptr()), scharr_y(top_s16, bot_s16));
197  },
198  input, output_x, output_y);
199  }
// Case 2: horizontal gradient only.
200  else if(_run_scharr_x)
201  {
202  execute_window_loop(window, [&](const Coordinates &)
203  {
204 
205  const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
206  const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
207  const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
208 
209  const int16x8x2_t top_s16 =
210  {
211  {
212  vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(top_data))),
213  vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(top_data)))
214  }
215  };
216  const int16x8x2_t mid_s16 =
217  {
218  {
219  vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mid_data))),
220  vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mid_data)))
221  }
222  };
223  const int16x8x2_t bot_s16 =
224  {
225  {
226  vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bot_data))),
227  vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bot_data)))
228  }
229  };
230 
231  vst1q_s16(reinterpret_cast<int16_t *>(output_x.ptr()), scharr_x(top_s16, mid_s16, bot_s16));
232  },
233  input, output_x);
234  }
// Case 3: vertical gradient only. The middle row is not loaded because
// scharr_y() takes only the top and bottom rows.
235  else if(_run_scharr_y)
236  {
237  execute_window_loop(window, [&](const Coordinates &)
238  {
239 
240  const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset());
241  const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
242 
243  const int16x8x2_t top_s16 =
244  {
245  {
246  vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(top_data))),
247  vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(top_data)))
248  }
249  };
250  const int16x8x2_t bot_s16 =
251  {
252  {
253  vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bot_data))),
254  vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bot_data)))
255  }
256  };
257 
258  vst1q_s16(reinterpret_cast<int16_t *>(output_y.ptr()), scharr_y(top_s16, bot_s16));
259  },
260  input, output_y);
261  }
262 }
unsigned int top
top of the border
Definition: Types.h:375
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
const Window & window() const
The maximum window the kernel can be executed on.
Definition: IKernel.cpp:28
uint8_t * ptr_to_element(const Coordinates &id) const
Return a pointer to the element at the passed coordinates.
Definition: ITensor.h:63
Container for 2D border size.
Definition: Types.h:273
1 channel, 1 U8 per channel
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
Definition: Error.h:466
Interface for Neon tensor.
Definition: ITensor.h:36
Copyright (c) 2017-2021 Arm Limited.
virtual ValidRegion valid_region() const =0
Valid region of the tensor.
Implementation of a rectangular access pattern.
bool update_window_and_padding(Window &win, Ts &&... patterns)
Update window and padding size for each of the access patterns.
Definition: WindowHelpers.h:46
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
Definition: Error.h:152
Class to describe a number of elements in each dimension.
Definition: Steps.h:40
Coordinates of an item.
Definition: Coordinates.h:37
Implementation of a row access pattern.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
constexpr uint8_t * ptr() const
Return a pointer to the current pixel.
Definition: Helpers.inl:139
unsigned int left
left of the border
Definition: Types.h:378
NEScharr3x3Kernel()
Default constructor.
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
Definition: Validate.h:941
1 channel, 1 S16 per channel
#define ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
Definition: Validate.h:790
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
Information about executing thread and CPU.
Definition: CPPTypes.h:235
unsigned int num_elems_processed_per_iteration
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_funct...
Definition: Helpers.inl:77
void configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined)
Initialise the kernel's source, destination and border.
constexpr size_t offset() const
Return the offset in bytes from the first element to the current position of the iterator.
Definition: Helpers.inl:134
Iterator updated by execute_window_loop for each window element.
Definition: Helpers.h:46
void run(const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
Describe a multidimensional execution window.
Definition: Window.h:39
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)
Definition: Validate.h:205
BorderSize border_size() const override
The size of the border for that kernel.