Compute Library
 21.02
NEChannelCombineKernel.cpp
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2016-2020 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
25 
26 #include "arm_compute/core/Error.h"
33 #include "arm_compute/core/Types.h"
38 
39 #include <arm_neon.h>
40 
41 using namespace arm_compute;
42 
43 namespace arm_compute
44 {
45 class Coordinates;
46 } // namespace arm_compute
47 
// Constructor initializer list: no combine routine selected, no planes or outputs bound,
// unit horizontal/vertical sub-sampling for all three planes, 8 elements per iteration,
// and the kernel flagged as parallelizable.
// NOTE(review): the declarator line that precedes this initializer list is not visible in this listing.
49  : _func(nullptr), _planes{ { nullptr } }, _output(nullptr), _output_multi(nullptr), _x_subsampling{ { 1, 1, 1 } }, _y_subsampling{ { 1, 1, 1 } }, _num_elems_processed_per_iteration(8),
50 _is_parallelizable(true)
51 {
52 }
53 
// Configure the kernel to combine separate input planes into one interleaved output tensor.
//
// The output format picks the routine stored in _func:
//   RGB888   -> combine_3C
//   RGBA8888 -> combine_4C
//   UYVY422  -> combine_YUV_1p<true>
//   YUYV422  -> combine_YUV_1p<false>
// Any other format reaches ARM_COMPUTE_ERROR("Not supported format.").
//
// plane0, plane1, plane2: input planes; asserted non-null and distinct from output.
// plane3: fourth input plane; only stored when the output format is RGBA8888, may be nullptr otherwise.
// output: destination tensor; asserted non-null.
54 void NEChannelCombineKernel::configure(const ITensor *plane0, const ITensor *plane1, const ITensor *plane2, const ITensor *plane3, ITensor *output)
55 {
56  ARM_COMPUTE_ERROR_ON_NULLPTR(plane0, plane1, plane2, output);
57  ARM_COMPUTE_ERROR_ON(plane0 == output);
58  ARM_COMPUTE_ERROR_ON(plane1 == output);
59  ARM_COMPUTE_ERROR_ON(plane2 == output);
60 
65 
69 
70  const Format output_format = output->info()->format();
71 
72  // Check if horizontal dimension of Y plane is even and validate horizontal sub-sampling dimensions for U and V planes
73  if(Format::YUYV422 == output_format || Format::UYVY422 == output_format)
74  {
75  // Validate Y plane of input and output
76  ARM_COMPUTE_ERROR_ON_TENSORS_NOT_EVEN(output_format, plane0, output);
77 
78  // Validate U and V plane of the input
79  ARM_COMPUTE_ERROR_ON_TENSORS_NOT_SUBSAMPLED(output_format, plane0->info()->tensor_shape(), plane1, plane2);
80  }
81 
82  _planes[0] = plane0;
83  _planes[1] = plane1;
84  _planes[2] = plane2;
85  _planes[3] = nullptr;
86 
87  // Validate the last input tensor only for RGBA format
88  if(Format::RGBA8888 == output_format)
89  {
92 
95 
96  _planes[3] = plane3;
97  }
98 
99  _output = output;
100  _output_multi = nullptr;
101 
102  // Half the processed elements for U and V channels due to horizontal sub-sampling of 2
    // NOTE(review): nothing in this function sets _x_subsampling back to 1 for the RGB formats,
    // so a value of 2 left by an earlier 4:2:2 configure would persist - confirm reconfiguration is not expected.
103  if(Format::YUYV422 == output_format || Format::UYVY422 == output_format)
104  {
105  _x_subsampling[1] = 2;
106  _x_subsampling[2] = 2;
107  }
108 
    // Defaults; the two 4:2:2 cases in the switch below raise the element count to 16.
109  _num_elems_processed_per_iteration = 8;
110  _is_parallelizable = true;
111 
112  // Select function and number of elements to process given the output format
113  switch(output_format)
114  {
115  case Format::RGB888:
116  _func = &NEChannelCombineKernel::combine_3C;
117  break;
118  case Format::RGBA8888:
119  _func = &NEChannelCombineKernel::combine_4C;
120  break;
121  case Format::UYVY422:
122  _num_elems_processed_per_iteration = 16;
123  _func = &NEChannelCombineKernel::combine_YUV_1p<true>;
124  break;
125  case Format::YUYV422:
126  _num_elems_processed_per_iteration = 16;
127  _func = &NEChannelCombineKernel::combine_YUV_1p<false>;
128  break;
129  default:
130  ARM_COMPUTE_ERROR("Not supported format.");
131  break;
132  }
133 
    // The execution window is derived from plane0 and steps by the per-iteration element count along X.
134  Window win = calculate_max_window(*plane0->info(), Steps(_num_elems_processed_per_iteration));
135 
    // NOTE(review): plane0's access width is divided by _x_subsampling[1] exactly like the chroma planes,
    // whereas combine_YUV_1p reads plane0 with vld2_u8 at the full window step - confirm this width is intended.
136  AccessWindowHorizontal output_access(output->info(), 0, _num_elems_processed_per_iteration);
137  AccessWindowHorizontal plane0_access(plane0->info(), 0, _num_elems_processed_per_iteration / _x_subsampling[1], 1.f / _x_subsampling[0]);
138  AccessWindowHorizontal plane1_access(plane1->info(), 0, _num_elems_processed_per_iteration / _x_subsampling[1], 1.f / _x_subsampling[1]);
139  AccessWindowHorizontal plane2_access(plane2->info(), 0, _num_elems_processed_per_iteration / _x_subsampling[1], 1.f / _x_subsampling[2]);
    // plane3 is optional: its access pattern is built over a null info when plane3 is nullptr.
140  AccessWindowHorizontal plane3_access(plane3 == nullptr ? nullptr : plane3->info(), 0, _num_elems_processed_per_iteration);
141 
    // NOTE(review): the statement that opens the following argument list is not visible in this listing.
143  win,
144  plane0_access,
145  plane1_access,
146  plane2_access,
147  plane3_access,
148  output_access);
149 
    // NOTE(review): the declaration of valid_region that these two arguments belong to is not visible in this listing.
151  plane1->info()->valid_region(),
152  plane2->info()->valid_region());
153 
    // Narrow the valid region further by plane3 when a fourth plane was supplied.
154  if(plane3 != nullptr)
155  {
156  valid_region = intersect_valid_regions(plane3->info()->valid_region(), valid_region);
157  }
158 
    // The output keeps its full tensor shape; only the anchor comes from the combined input region.
159  output_access.set_valid_region(win, ValidRegion(valid_region.anchor, output->info()->tensor_shape()));
160 
161  INEKernel::configure(win);
162 }
163 
// Configure the kernel to combine three input planes into a multi-planar output image.
//
// The output format picks the routine stored in _func and the sub-sampling factors:
//   NV12, NV21 -> combine_YUV_2p, sub-sampling {1, 2, 2} in X and Y
//   IYUV       -> combine_YUV_3p, sub-sampling {1, 2, 2} in X and Y, not parallelizable
//   YUV444     -> combine_YUV_3p, sub-sampling {1, 1, 1} in X and Y, not parallelizable
// Any other format reaches ARM_COMPUTE_ERROR("Not supported format.").
//
// plane0, plane1, plane2: input planes; asserted non-null.
// output: destination multi-planar image; asserted non-null.
164 void NEChannelCombineKernel::configure(const IImage *plane0, const IImage *plane1, const IImage *plane2, IMultiImage *output)
165 {
166  ARM_COMPUTE_ERROR_ON_NULLPTR(plane0, plane1, plane2, output);
170 
175 
179 
180  const Format output_format = output->info()->format();
181 
182  // Validate shape of Y plane to be even and shape of sub-sampling dimensions for U and V planes
183  // Perform validation only for formats which require sub-sampling.
184  if(Format::YUV444 != output_format)
185  {
186  // Validate Y plane of input and output
187  ARM_COMPUTE_ERROR_ON_TENSORS_NOT_EVEN(output_format, plane0, output->plane(0));
188 
189  // Validate U and V plane of the input
190  ARM_COMPUTE_ERROR_ON_TENSORS_NOT_SUBSAMPLED(output_format, plane0->info()->tensor_shape(), plane1, plane2);
191 
192  // Validate second plane U (NV12 and NV21 have a UV88 combined plane while IYUV has only the U plane)
193  // MultiImage generates the correct tensor shape but also check in case the tensor shape of planes was changed to a wrong size
194  ARM_COMPUTE_ERROR_ON_TENSORS_NOT_SUBSAMPLED(output_format, plane0->info()->tensor_shape(), output->plane(1));
195 
196  // Validate the last plane V of format IYUV
197  if(Format::IYUV == output_format)
198  {
199  // Validate the V plane (plane 2) of the output
200  ARM_COMPUTE_ERROR_ON_TENSORS_NOT_SUBSAMPLED(output_format, plane0->info()->tensor_shape(), output->plane(2));
201  }
202  }
203 
204  _planes[0] = plane0;
205  _planes[1] = plane1;
206  _planes[2] = plane2;
207  _planes[3] = nullptr;
208  _output = nullptr;
209  _output_multi = output;
210 
    // Defaults for the three-plane formats; the NV12/NV21 case below overrides both.
211  bool has_two_planes = false;
212  unsigned int num_elems_written_plane1 = 8;
213 
214  _num_elems_processed_per_iteration = 8;
215  _is_parallelizable = true;
216 
217  switch(output_format)
218  {
219  case Format::NV12:
220  case Format::NV21:
221  _x_subsampling = { { 1, 2, 2 } };
222  _y_subsampling = { { 1, 2, 2 } };
223  _func = &NEChannelCombineKernel::combine_YUV_2p;
224  has_two_planes = true;
225  num_elems_written_plane1 = 16;
226  break;
227  case Format::IYUV:
228  _is_parallelizable = false;
229  _x_subsampling = { { 1, 2, 2 } };
230  _y_subsampling = { { 1, 2, 2 } };
231  _func = &NEChannelCombineKernel::combine_YUV_3p;
232  break;
233  case Format::YUV444:
234  _is_parallelizable = false;
235  _x_subsampling = { { 1, 1, 1 } };
236  _y_subsampling = { { 1, 1, 1 } };
237  _func = &NEChannelCombineKernel::combine_YUV_3p;
238  break;
239  default:
240  ARM_COMPUTE_ERROR("Not supported format.");
241  break;
242  }
243 
    // The window's Y step is the largest vertical sub-sampling factor across the planes.
244  const unsigned int y_step = *std::max_element(_y_subsampling.begin(), _y_subsampling.end());
245 
246  Window win = calculate_max_window(*plane0->info(), Steps(_num_elems_processed_per_iteration, y_step));
247  AccessWindowRectangle output_plane0_access(output->plane(0)->info(), 0, 0, _num_elems_processed_per_iteration, 1, 1.f, 1.f / _y_subsampling[0]);
248  AccessWindowRectangle output_plane1_access(output->plane(1)->info(), 0, 0, num_elems_written_plane1, 1, 1.f / _x_subsampling[1], 1.f / _y_subsampling[1]);
    // For the two-plane formats the third output plane gets an access pattern built over a null info.
249  AccessWindowRectangle output_plane2_access(has_two_planes ? nullptr : output->plane(2)->info(), 0, 0, _num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[2], 1.f / _y_subsampling[2]);
250 
    // NOTE(review): the statement that opens the following argument list is not visible in this listing.
252  AccessWindowHorizontal(plane0->info(), 0, _num_elems_processed_per_iteration),
253  AccessWindowRectangle(plane1->info(), 0, 0, _num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[1], 1.f / _y_subsampling[1]),
254  AccessWindowRectangle(plane2->info(), 0, 0, _num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[2], 1.f / _y_subsampling[2]),
255  output_plane0_access,
256  output_plane1_access,
257  output_plane2_access);
258 
259  ValidRegion plane0_valid_region = plane0->info()->valid_region();
    // NOTE(review): for the three-plane formats output plane 1 takes its region from input plane2, not plane1 - confirm intended.
260  ValidRegion output_plane1_region = has_two_planes ? intersect_valid_regions(plane1->info()->valid_region(), plane2->info()->valid_region()) : plane2->info()->valid_region();
261 
262  output_plane0_access.set_valid_region(win, ValidRegion(plane0_valid_region.anchor, output->plane(0)->info()->tensor_shape()));
263  output_plane1_access.set_valid_region(win, ValidRegion(output_plane1_region.anchor, output->plane(1)->info()->tensor_shape()));
    // NOTE(review): this runs for the two-plane formats too, where output_plane2_access wraps a null info
    // and output->plane(2) is still dereferenced - confirm both are safe.
264  output_plane2_access.set_valid_region(win, ValidRegion(plane2->info()->valid_region().anchor, output->plane(2)->info()->tensor_shape()));
265 
266  INEKernel::configure(win);
267 }
268 
270 {
    // Returns the flag set by configure(): true by default, false for the IYUV and YUV444 output formats.
271  return _is_parallelizable;
272 }
273 
275 {
    // The thread/CPU info argument is not consulted by this kernel.
276  ARM_COMPUTE_UNUSED(info);
    // A null _func means no combine routine has been selected; configure() is what assigns it.
279  ARM_COMPUTE_ERROR_ON(_func == nullptr);
280 
    // Dispatch to the selected combine routine through the member-function pointer.
281  (this->*_func)(window);
282 }
283 
284 void NEChannelCombineKernel::combine_3C(const Window &win)
285 {
286  Iterator p0(_planes[0], win);
287  Iterator p1(_planes[1], win);
288  Iterator p2(_planes[2], win);
289  Iterator out(_output, win);
290 
291  execute_window_loop(win, [&](const Coordinates &)
292  {
293  const auto p0_ptr = static_cast<uint8_t *>(p0.ptr());
294  const auto p1_ptr = static_cast<uint8_t *>(p1.ptr());
295  const auto p2_ptr = static_cast<uint8_t *>(p2.ptr());
296  const auto out_ptr = static_cast<uint8_t *>(out.ptr());
297 
298  const uint8x8x3_t pixels =
299  {
300  {
301  vld1_u8(p0_ptr),
302  vld1_u8(p1_ptr),
303  vld1_u8(p2_ptr)
304  }
305  };
306 
307  vst3_u8(out_ptr, pixels);
308  },
309  p0, p1, p2, out);
310 }
311 
312 void NEChannelCombineKernel::combine_4C(const Window &win)
313 {
314  Iterator p0(_planes[0], win);
315  Iterator p1(_planes[1], win);
316  Iterator p2(_planes[2], win);
317  Iterator p3(_planes[3], win);
318  Iterator out(_output, win);
319 
320  execute_window_loop(win, [&](const Coordinates &)
321  {
322  const auto p0_ptr = static_cast<uint8_t *>(p0.ptr());
323  const auto p1_ptr = static_cast<uint8_t *>(p1.ptr());
324  const auto p2_ptr = static_cast<uint8_t *>(p2.ptr());
325  const auto p3_ptr = static_cast<uint8_t *>(p3.ptr());
326  const auto out_ptr = static_cast<uint8_t *>(out.ptr());
327 
328  const uint8x8x4_t pixels =
329  {
330  {
331  vld1_u8(p0_ptr),
332  vld1_u8(p1_ptr),
333  vld1_u8(p2_ptr),
334  vld1_u8(p3_ptr)
335  }
336  };
337 
338  vst4_u8(out_ptr, pixels);
339  },
340  p0, p1, p2, p3, out);
341 }
342 
343 template <bool is_uyvy>
344 void NEChannelCombineKernel::combine_YUV_1p(const Window &win)
345 {
346  // Create sub-sampled uv window and init uv planes
347  Window win_uv(win);
348  win_uv.set_dimension_step(Window::DimX, win.x().step() / _x_subsampling[1]);
349  win_uv.validate();
350 
351  Iterator p0(_planes[0], win);
352  Iterator p1(_planes[1], win_uv);
353  Iterator p2(_planes[2], win_uv);
354  Iterator out(_output, win);
355 
356  constexpr auto shift = is_uyvy ? 1 : 0;
357 
358  execute_window_loop(win, [&](const Coordinates &)
359  {
360  const auto p0_ptr = static_cast<uint8_t *>(p0.ptr());
361  const auto p1_ptr = static_cast<uint8_t *>(p1.ptr());
362  const auto p2_ptr = static_cast<uint8_t *>(p2.ptr());
363  const auto out_ptr = static_cast<uint8_t *>(out.ptr());
364 
365  const uint8x8x2_t pixels_y = vld2_u8(p0_ptr);
366  const uint8x8x2_t pixels_uv =
367  {
368  {
369  vld1_u8(p1_ptr),
370  vld1_u8(p2_ptr)
371  }
372  };
373 
374  uint8x8x4_t pixels{ {} };
375  pixels.val[0 + shift] = pixels_y.val[0];
376  pixels.val[1 - shift] = pixels_uv.val[0];
377  pixels.val[2 + shift] = pixels_y.val[1];
378  pixels.val[3 - shift] = pixels_uv.val[1];
379 
380  vst4_u8(out_ptr, pixels);
381  },
382  p0, p1, p2, out);
383 }
384 
385 void NEChannelCombineKernel::combine_YUV_2p(const Window &win)
386 {
387  ARM_COMPUTE_ERROR_ON(win.x().start() % _x_subsampling[1]);
388  ARM_COMPUTE_ERROR_ON(win.y().start() % _y_subsampling[1]);
389 
390  // Copy first plane
391  copy_plane(win, 0);
392 
393  // Update UV window
394  Window uv_win(win);
395  uv_win.set(Window::DimX, Window::Dimension(uv_win.x().start() / _x_subsampling[1], uv_win.x().end() / _x_subsampling[1], uv_win.x().step() / _x_subsampling[1]));
396  uv_win.set(Window::DimY, Window::Dimension(uv_win.y().start() / _y_subsampling[1], uv_win.y().end() / _y_subsampling[1], 1));
397  uv_win.validate();
398 
399  // Update output win
400  Window out_win(win);
401  out_win.set(Window::DimX, Window::Dimension(out_win.x().start(), out_win.x().end(), out_win.x().step() / _x_subsampling[1]));
402  out_win.set(Window::DimY, Window::Dimension(out_win.y().start() / _y_subsampling[1], out_win.y().end() / _y_subsampling[1], 1));
403  out_win.validate();
404 
405  // Construct second plane
406  const int shift = (Format::NV12 == _output_multi->info()->format()) ? 0 : 1;
407  Iterator p1(_planes[1 + shift], uv_win);
408  Iterator p2(_planes[2 - shift], uv_win);
409  Iterator out(_output_multi->plane(1), out_win);
410 
411  // Increase step size after iterator is created to calculate stride correctly for multi channel format
412  out_win.set_dimension_step(Window::DimX, out_win.x().step() * _x_subsampling[1]);
413 
414  execute_window_loop(out_win, [&](const Coordinates &)
415  {
416  const uint8x8x2_t pixels =
417  {
418  {
419  vld1_u8(p1.ptr()),
420  vld1_u8(p2.ptr())
421  }
422  };
423 
424  vst2_u8(out.ptr(), pixels);
425  },
426  p1, p2, out);
427 }
428 
429 void NEChannelCombineKernel::combine_YUV_3p(const Window &win)
430 {
431  copy_plane(win, 0);
432  copy_plane(win, 1);
433  copy_plane(win, 2);
434 }
435 
436 void NEChannelCombineKernel::copy_plane(const Window &win, uint32_t plane_id)
437 {
438  ARM_COMPUTE_ERROR_ON(win.x().start() % _x_subsampling[plane_id]);
439  ARM_COMPUTE_ERROR_ON(win.y().start() % _y_subsampling[plane_id]);
440 
441  // Update window
442  Window tmp_win(win);
443  tmp_win.set(Window::DimX, Window::Dimension(tmp_win.x().start() / _x_subsampling[plane_id], tmp_win.x().end() / _x_subsampling[plane_id], tmp_win.x().step() / _x_subsampling[plane_id]));
444  tmp_win.set(Window::DimY, Window::Dimension(tmp_win.y().start() / _y_subsampling[plane_id], tmp_win.y().end() / _y_subsampling[plane_id], 1));
445 
446  Iterator in(_planes[plane_id], tmp_win);
447  Iterator out(_output_multi->plane(plane_id), tmp_win);
448 
449  execute_window_loop(tmp_win, [&](const Coordinates &)
450  {
451  const uint8x8_t pixels = vld1_u8(in.ptr());
452 
453  vst1_u8(out.ptr(), pixels);
454  },
455  in, out);
456 }
A single plane of 32-bit macro pixel of U0, Y0, V0, Y1 byte.
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
const Window & window() const
The maximum window the kernel can be executed on.
Definition: IKernel.cpp:28
bool is_parallelisable() const override
Indicates whether or not the kernel is parallelisable.
#define ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(t)
Definition: Validate.h:856
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
Definition: Error.h:352
constexpr int step() const
Return the step of the dimension.
Definition: Window.h:104
1 channel, 1 U8 per channel
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
Definition: Error.h:466
A 2 plane YUV format of Luma (Y) and interleaved UV data at 4:2:0 sampling.
Describe one of the image's dimensions with a start, end and step.
Definition: Window.h:77
void validate() const
Will validate all the window's dimensions' values when asserts are enabled.
Definition: Window.inl:173
#define ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(t,...)
Definition: Validate.h:643
const ValidRegion valid_region
Definition: Scale.cpp:221
Interface for Neon tensor.
Definition: ITensor.h:36
A 2 plane YUV format of Luma (Y) and interleaved VU data at 4:2:0 sampling.
Copyright (c) 2017-2021 Arm Limited.
virtual ValidRegion valid_region() const =0
Valid region of the tensor.
3 channels, 1 U8 per channel
#define ARM_COMPUTE_ERROR_ON_TENSORS_NOT_EVEN(...)
Definition: Validate.h:318
Implementation of a rectangular access pattern.
virtual Format format() const =0
Colour format of the image.
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
Definition: Window.h:43
bool update_window_and_padding(Window &win, Ts &&... patterns)
Update window and padding size for each of the access patterns.
Definition: WindowHelpers.h:46
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
Definition: Error.h:152
Interface for multi-planar images.
Definition: IMultiImage.h:34
virtual const TensorShape & tensor_shape() const =0
Size for each dimension of the tensor.
Format
Image colour formats.
Definition: Types.h:54
Class to describe a number of elements in each dimension.
Definition: Steps.h:40
Coordinates of an item.
Definition: Coordinates.h:37
Implementation of a row access pattern.
void configure(const ITensor *plane0, const ITensor *plane1, const ITensor *plane2, const ITensor *plane3, ITensor *output)
Configure function's inputs and outputs.
A 3 plane of 8 bit 4:4:4 sampled Y, U, V planes.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
constexpr uint8_t * ptr() const
Return a pointer to the current pixel.
Definition: Helpers.inl:139
ValidRegion intersect_valid_regions(const Ts &... regions)
Intersect multiple valid regions.
Definition: WindowHelpers.h:74
void set(size_t dimension, const Dimension &dim)
Set the values of a given dimension.
Definition: Window.inl:49
__kernel void copy_plane(__global uchar *src_ptr, uint src_stride_x, uint src_step_x, uint src_stride_y, uint src_step_y, uint src_offset_first_element_in_bytes, __global uchar *dst_ptr, uint dst_stride_x, uint dst_step_x, uint dst_stride_y, uint dst_step_y, uint dst_offset_first_element_in_bytes)
This function extracts a given plane from an multi-planar image.
virtual const MultiImageInfo * info() const =0
Interface to be implemented by the child class to return the multi-planar image's metadata...
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
Definition: Validate.h:941
#define ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
Definition: Validate.h:790
A 3 plane of 8-bit 4:2:0 sampled Y, U, V planes.
static constexpr size_t DimY
Alias for dimension 1 also known as Y dimension.
Definition: Window.h:45
4 channels, 1 U8 per channel
void set_dimension_step(size_t dimension, int step)
Set the step of a given dimension.
Definition: Window.inl:167
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
Information about executing thread and CPU.
Definition: CPPTypes.h:235
constexpr const Dimension & y() const
Alias to access the second dimension of the window.
Definition: Window.h:154
virtual IImage * plane(unsigned int index)=0
Return a pointer to the requested plane of the image.
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
Definition: Validate.h:161
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_funct...
Definition: Helpers.inl:77
Container for valid region of a window.
Definition: Types.h:188
constexpr int end() const
Return the end of the dimension.
Definition: Window.h:99
A single plane of 32-bit macro pixel of Y0, U0, Y1, V0 bytes.
Iterator updated by execute_window_loop for each window element.
Definition: Helpers.h:46
Format format() const
Colour format of the image.
void run(const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
constexpr int start() const
Return the start of the dimension.
Definition: Window.h:94
Describe a multidimensional execution window.
Definition: Window.h:39
Coordinates anchor
Anchor for the start of the valid region.
Definition: Types.h:260
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)
Definition: Validate.h:205
#define ARM_COMPUTE_ERROR_ON_TENSORS_NOT_SUBSAMPLED(...)
Definition: Validate.h:351
constexpr const Dimension & x() const
Alias to access the first dimension of the window.
Definition: Window.h:145