49 template <
bool calc_sum_squared>
52 uint64x1_t
sum = vdup_n_u64(0);
53 uint64x1_t sum_squared = vdup_n_u64(0);
58 const uint8x16_t in_data = vld1q_u8(iterator.
ptr());
61 const uint16x8_t tmp0 = vaddl_u8(vget_low_u8(in_data), vget_high_u8(in_data));
62 const uint32x4_t tmp1 = vaddl_u16(vget_low_u16(tmp0), vget_high_u16(tmp0));
63 const uint32x2_t tmp2 = vadd_u32(vget_low_u32(tmp1), vget_high_u32(tmp1));
66 sum = vpadal_u32(sum, tmp2);
70 const uint16x8_t square_data_low = vmull_u8(vget_low_u8(in_data), vget_low_u8(in_data));
71 const uint16x8_t square_data_high = vmull_u8(vget_high_u8(in_data), vget_high_u8(in_data));
74 const uint32x4_t tmp0_low = vaddl_u16(vget_low_u16(square_data_low), vget_high_u16(square_data_low));
75 const uint32x4_t tmp0_high = vaddl_u16(vget_low_u16(square_data_high), vget_high_u16(square_data_high));
76 const uint32x4_t tmp1 = vaddq_u32(tmp0_low, tmp0_high);
77 const uint32x2_t tmp2 = vadd_u32(vget_low_u32(tmp1), vget_high_u32(tmp1));
80 sum_squared = vpadal_u32(sum_squared, tmp2);
85 return std::make_pair(sum, sum_squared);
90 : _input(nullptr), _mean(nullptr), _stddev(nullptr), _global_sum(nullptr), _global_sum_squared(nullptr), _mtx(), _border_size(0)
110 _global_sum = global_sum;
111 _global_sum_squared = global_sum_squared;
122 INEKernel::configure(win);
132 uint64x1_t local_sum = vdup_n_u64(0);
133 uint64x1_t local_sum_squared = vdup_n_u64(0);
135 if(_stddev !=
nullptr)
137 std::tie(local_sum, local_sum_squared) = accumulate<true>(
window,
input);
141 std::tie(local_sum, local_sum_squared) = accumulate<false>(
window,
input);
149 *_global_sum += vget_lane_u64(local_sum, 0);
151 const float mean = *_global_sum / num_pixels;
154 if(_stddev !=
nullptr)
156 const uint64_t tmp_sum_squared = vget_lane_u64(local_sum_squared, 0);
157 *_global_sum_squared += tmp_sum_squared;
158 *_stddev = std::sqrt((*_global_sum_squared / num_pixels) - (mean * mean));
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
const Window & window() const
The maximum window the kernel can be executed on.
virtual size_t dimension(size_t index) const =0
Return the size of the requested dimension.
#define ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(t)
Container for 2D border size.
1 channel, 1 U8 per channel
DATA_TYPE sum(__global const DATA_TYPE *input)
Calculate sum of a vector.
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception is thrown.
BorderSize border_size() const override
The size of the border for that kernel.
std::unique_lock< Mutex > unique_lock
Wrapper of unique_lock data-object.
Interface for Neon tensor.
Copyright (c) 2017-2021 Arm Limited.
bool update_window_and_padding(Window &win, Ts &&... patterns)
Update window and padding size for each of the access patterns.
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
auto ceil_to_multiple(S value, T divisor) -> decltype(((value+divisor - 1)/divisor) *divisor)
Computes the smallest number greater than or equal to value that is a multiple of divisor.
Class to describe a number of elements in each dimension.
Implementation of a row access pattern.
void configure(const IImage *input, float *mean, uint64_t *global_sum, float *stddev=nullptr, uint64_t *global_sum_squared=nullptr)
Initialise the kernel's input and outputs.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
constexpr uint8_t * ptr() const
Return a pointer to the current pixel.
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
void run(const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
#define ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
Information about executing thread and CPU.
unsigned int num_elems_processed_per_iteration
__kernel void accumulate(__global uchar *input_ptr, uint input_stride_x, uint input_step_x, uint input_stride_y, uint input_step_y, uint input_offset_first_element_in_bytes, __global uchar *accu_ptr, uint accu_stride_x, uint accu_step_x, uint accu_stride_y, uint accu_step_y, uint accu_offset_first_element_in_bytes)
This function accumulates an input image into output image.
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling lambda_function for each window element.
NEMeanStdDevKernel()
Default constructor.
Iterator updated by execute_window_loop for each window element.
Describe a multidimensional execution window.
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)