24 #ifndef ACL_SRC_CPU_KERNELS_NORM_LAYER_GENERIC_NEON_IMPL_H
25 #define ACL_SRC_CPU_KERNELS_NORM_LAYER_GENERIC_NEON_IMPL_H
// Normalization routine template (its name and parameter list are not visible in this excerpt).
// Template parameters, as used by the visible code:
//   T          - element type: buffers are reinterpreted as T* and the accumulator is a T.
//   S          - step along x used by the middle (vector-store) loop; copied into window_step_x.
//   dim        - selects the "slice" coordinate: x itself when dim == 0, otherwise id[dim].
//   do_2D_norm - when true the neighbourhood also spans rows around id[dim_y];
//                when false the row range collapses to the single row 0.
57 template <
typename T,
unsigned int S,
unsigned int dim,
bool do_2D_norm>
// Bounds of the execution window along x, converted to signed int for the index arithmetic below.
67 const auto window_start_x =
static_cast<int>(window.
x().
start());
68 const auto window_end_x =
static_cast<int>(window.
x().
end());
69 const int window_step_x = S;
// Iterator over the in_squared tensor; `win` is declared outside this excerpt.
// NOTE(review): presumably in_squared holds the element-wise squares of the input - confirm.
72 Iterator input_squared(in_squared, win);
// Scalar path: normalizes the single element at index x of the current row.
//   x                       - element index along x.
//   id                      - coordinates of the current window position.
//   current_row             - row of the element being normalized.
//   first_row / last_row    - inclusive row range of the neighbourhood.
//   input_ptr               - input row pointer, indexed with x.
//   input_squared_start_ptr - byte pointer that x * input_squared_stride_x is added to.
// The remaining parameter(s) and the braces of the lambda are not visible here; the body
// writes through `output_ptr`, which is presumably one of them.
88 auto sequential_normalization = [&](
const int x,
const Coordinates &id,
const int current_row,
const int first_row,
89 const int last_row,
const T *input_ptr,
const uint8_t *input_squared_start_ptr,
// Slice neighbourhood [current_slice - radius, current_slice + radius], clamped to [0, max_right].
// `radius` and `max_right` are captured from the enclosing scope (declarations not visible).
92 const int current_slice = dim == 0 ? x :
id[dim];
93 const int first_slice = std::max(current_slice - radius, 0);
94 const int last_slice = std::min(current_slice + radius, max_right);
// Byte address of the squared-input element at x; strides are applied as byte offsets
// because the base pointer is a uint8_t pointer.
96 const uint8_t *
const input_squared_x_ptr = input_squared_start_ptr + x * input_squared_stride_x;
// Accumulator for the neighbourhood, starting at zero.
98 auto accu =
static_cast<T
>(0.f);
// Visit every row of the neighbourhood, addressed relative to current_row.
99 for (
int j = first_row; j <= last_row; ++j)
102 const uint8_t *
const input_squared_ptr = input_squared_x_ptr + (j - current_row) * input_squared_stride_row;
// Visit every slice of the neighbourhood, addressed relative to current_slice.
103 for (
int i = first_slice; i <= last_slice; ++i)
// Reads one T from the squared input; the beginning of this statement is not visible -
// presumably the value is added to accu (TODO confirm).
106 *
reinterpret_cast<const T *
>(input_squared_ptr + (i - current_slice) * input_squared_stride_slice);
// Divisor: (accu * scale_coeff + kappa) raised to the power beta.
111 const auto normalized =
112 std::pow(accu *
static_cast<T
>(ninfo.
scale_coeff()) +
static_cast<T
>(ninfo.
kappa()), ninfo.
beta());
// Output element = input element divided by the divisor.
113 const auto normalized_pixel = (*(input_ptr + x)) / normalized;
114 *(output_ptr + x) = normalized_pixel;
// Per-position body: `input` and `output` are iterators declared outside this excerpt;
// their current pointers are viewed as rows of T.
121 const auto input_ptr =
reinterpret_cast<const T *
>(
input.ptr());
122 auto output_ptr =
reinterpret_cast<T *
>(output.
ptr());
// Row neighbourhood, clamped to [0, max_bottom]; fixed to row 0 when 2D normalization is off.
125 const int current_row = do_2D_norm ?
id[dim_y] : 0;
126 const int first_row = do_2D_norm ? std::max(current_row - radius, 0) : 0;
127 const int last_row = do_2D_norm ? std::min(current_row + radius, max_bottom) : 0;
129 int x = window_start_x;
// Leading elements (x < radius) use the scalar path; this loop only runs when dim == 0.
131 for (; x < radius && x < window_end_x && dim == 0; ++x)
133 sequential_normalization(x,
id, current_row, first_row, last_row, input_ptr, input_squared.
ptr(),
// Middle section, advancing window_step_x elements per iteration and stopping
// window_step_x + radius before window_end_x.
// NOTE(review): the margin presumably keeps the neighbourhood reads in range - confirm.
138 for (; x <= window_end_x - window_step_x - radius; x += window_step_x)
140 const int current_slice = dim == 0 ? x :
id[dim];
141 const int first_slice = std::max(current_slice - radius, 0);
142 const int last_slice = std::min(current_slice + radius, max_right);
144 const uint8_t *
const input_squared_x_ptr = input_squared.
ptr() + x * input_squared_stride_x;
// Same row/slice traversal as the scalar path.
147 for (
int j = first_row; j <= last_row; ++j)
150 const uint8_t *
const input_squared_ptr =
151 input_squared_x_ptr + (j - current_row) * input_squared_stride_row;
152 for (
int i = first_slice; i <= last_slice; ++i)
// Tail of a statement whose beginning is not visible; it takes the address of the
// squared-input element for slice i.
156 input_squared_ptr + (i - current_slice) * input_squared_stride_slice)));
// Store the result at output_ptr + x; the computation of normalized_pixel for this
// loop is not visible in this excerpt.
163 wrapper::vstore(
reinterpret_cast<T *
>(output_ptr + x), normalized_pixel);
// Remaining elements up to window_end_x use the scalar path.
167 for (; x < window_end_x; ++x)
169 sequential_normalization(x,
id, current_row, first_row, last_row, input_ptr, input_squared.
ptr(),
// Trailing arguments of a call whose beginning is not visible.
173 input, input_squared, output);
177 #endif // ACL_SRC_CPU_KERNELS_NORM_LAYER_GENERIC_NEON_IMPL_H