Compute Library 21.02
NEBatchNormalizationLayerKernel.cpp
/*
 * Copyright (c) 2017-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "src/core/NEON/kernels/NEBatchNormalizationLayerKernel.h"

#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"
#include "src/core/CPP/Validate.h"
#include "src/core/NEON/NEMath.h"
#include "src/core/NEON/kernels/detail/NEActivationFunctionDetail.h"
#include "src/core/NEON/kernels/batchnormalization/impl/list.h"
#include "src/core/NEON/wrapper/wrapper.h"
#include "src/core/common/Registrars.h"
#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/WindowHelpers.h"

#include <map>

namespace arm_compute
{
namespace
{
struct BatchNormalizationSelectorData
{
    DataType dt;
};
using BatchNormalizationSelectorPtr = std::add_pointer<bool(const BatchNormalizationSelectorData &data)>::type;
using BatchNormalizationKernelPtr   = std::add_pointer<void(ITensor *, ITensor *, const ITensor *, const ITensor *, const ITensor *, const ITensor *,
                                                            float, ActivationLayerInfo &, const Window &)>::type;

struct BatchNormalizationKernel
{
    const char                          *name;
    const BatchNormalizationSelectorPtr  is_selected;
    BatchNormalizationKernelPtr          ukernel;
};

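// Table of the micro-kernels this kernel can dispatch to: the entries are chosen
// at build time by the SVE/FP16 feature macros and at run time by the input data type.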
static const BatchNormalizationKernel available_kernels[] =
{
#if defined(__ARM_FEATURE_SVE)
    {
        "fp16_sve_batch_normalization",
        [](const BatchNormalizationSelectorData & data) { return data.dt == DataType::F16; },
        REGISTER_FP16_SVE(arm_compute::cpu::fp16_sve_batch_normalization)
    },
    {
        "f32_sve_batch_normalization",
        [](const BatchNormalizationSelectorData & data) { return data.dt == DataType::F32; },
        REGISTER_FP32_SVE(arm_compute::cpu::fp32_sve_batch_normalization)
    },
#else  /* !defined(__ARM_FEATURE_SVE) */
#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
    {
        "fp16_neon_batch_normalization",
        [](const BatchNormalizationSelectorData & data) { return data.dt == DataType::F16; },
        REGISTER_FP16_NEON(arm_compute::cpu::fp16_neon_batch_normalization)
    },
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
    {
        "f32_neon_batch_normalization",
        [](const BatchNormalizationSelectorData & data) { return data.dt == DataType::F32; },
        REGISTER_FP32_NEON(arm_compute::cpu::fp32_neon_batch_normalization)
    },
#endif /* !defined(__ARM_FEATURE_SVE) */
};

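// Return the first registered kernel whose selector accepts the given data, or nullptr if none matches.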
const BatchNormalizationKernel *get_implementation(const BatchNormalizationSelectorData &data)
{
    for(const auto &uk : available_kernels)
    {
        if(uk.is_selected(data))
        {
            return &uk;
        }
    }
    return nullptr;
}

Status
validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *var,
                   const ITensorInfo *beta, const ITensorInfo *gamma, float epsilon, ActivationLayerInfo act_info)
{
    ARM_COMPUTE_UNUSED(epsilon);

    const auto *uk = get_implementation(BatchNormalizationSelectorData{ input->data_type() });
    ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);

    if(act_info.enabled())
    {
        ActivationLayerInfo::ActivationFunction act = act_info.activation();
        ARM_COMPUTE_RETURN_ERROR_ON(act != ActivationLayerInfo::ActivationFunction::RELU
                                    && act != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU
                                    && act != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
        ARM_COMPUTE_RETURN_ERROR_ON(act_info.b() > act_info.a());
    }

    if(nullptr != output)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, output);
    }

    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, mean, var);
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, var);
    if(beta != nullptr)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, beta);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, beta);
    }
    if(gamma != nullptr)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, gamma);
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mean, gamma);
    }
    ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)) != mean->dimension(0));

    return Status{};
}
} // namespace

template <typename T, bool fused_activation, typename F>
void NEBatchNormalizationLayerKernel::batch_normalization_nchw(const Window &window)
{
    /** Neon vector tag type. */
    using ExactTagType = typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;

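    // Elements processed per vector iteration: a 128-bit register holds
    // 16 / sizeof(T) lanes (4 x F32 or 8 x F16).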
    const int  window_step_x  = 16 / sizeof(T);
    const auto window_start_x = static_cast<int>(window.x().start());
    const auto window_end_x   = static_cast<int>(window.x().end());

    Window win_to_use = window;
    win_to_use.set(Window::DimX, Window::Dimension(0, 1, 1));

    Iterator input(_input, win_to_use);
    Iterator output(_output, win_to_use);

    F activation_functor(_act_info);

    // Hold information about the current feature map we are iterating.
    // Only compute denominator and Neon vectors once per feature map.
    int slice = -1;

    const auto input_mean  = reinterpret_cast<const T *>(_mean->ptr_to_element(Coordinates(0, 0)));
    const auto input_var   = reinterpret_cast<const T *>(_var->ptr_to_element(Coordinates(0, 0)));
    const auto input_gamma = (_gamma != nullptr) ? reinterpret_cast<const T *>(_gamma->ptr_to_element(Coordinates(0, 0))) : nullptr;
    const auto input_beta  = (_beta != nullptr) ? reinterpret_cast<const T *>(_beta->ptr_to_element(Coordinates(0, 0))) : nullptr;

    T mean        = static_cast<T>(0);
    T var         = static_cast<T>(0);
    T gamma       = static_cast<T>(1);
    T beta        = static_cast<T>(0);
    T denominator = static_cast<T>(0);

    auto       mean_vec        = wrapper::vdup_n(mean, ExactTagType{});
    auto       var_vec         = wrapper::vdup_n(var, ExactTagType{});
    auto       gamma_vec       = wrapper::vdup_n(gamma, ExactTagType{});
    auto       beta_vec        = wrapper::vdup_n(beta, ExactTagType{});
    auto       denominator_vec = wrapper::vdup_n(denominator, ExactTagType{});
    const auto epsilon_vec     = wrapper::vdup_n(static_cast<T>(_epsilon), ExactTagType{});
    execute_window_loop(win_to_use, [&](const Coordinates & id)
    {
        const auto input_ptr  = reinterpret_cast<const T *>(input.ptr());
        const auto output_ptr = reinterpret_cast<T *>(output.ptr());

        if(slice != id.z())
        {
            mean     = input_mean[id.z()];
            var      = input_var[id.z()];
            mean_vec = wrapper::vdup_n(mean, ExactTagType{});
            var_vec  = wrapper::vdup_n(var, ExactTagType{});
            if(input_gamma != nullptr)
            {
                gamma     = input_gamma[id.z()];
                gamma_vec = wrapper::vdup_n(gamma, ExactTagType{});
            }
            if(input_beta != nullptr)
            {
                beta     = input_beta[id.z()];
                beta_vec = wrapper::vdup_n(beta, ExactTagType{});
            }

            // Calculate denominator
            denominator_vec = wrapper::vinvsqrt(wrapper::vadd(var_vec, epsilon_vec));
            denominator     = wrapper::vgetlane(denominator_vec, 0);
            slice           = id.z();
        }

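        // Normalize, scale and shift: res = gamma * (x - mean) / sqrt(var + epsilon) + beta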
        // Perform core calculations using vector operations
        int x = window_start_x;
        for(; x <= (window_end_x - window_step_x); x += window_step_x)
        {
            // Calculate x bar
            const auto numerator = wrapper::vsub(wrapper::vloadq(input_ptr + x), mean_vec);
            const auto x_bar     = wrapper::vmul(numerator, denominator_vec);
            auto       res       = wrapper::vmla(beta_vec, x_bar, gamma_vec);

            // Perform fused activation
            if(fused_activation)
            {
                activation_functor(res);
            }

            // Store results
            wrapper::vstore(output_ptr + x, res);
        }

        // Compute left-over elements
        for(; x < window_end_x; ++x)
        {
            const T numerator = input_ptr[x] - mean;
            const T x_bar     = numerator * denominator;
            T       res       = beta + x_bar * gamma;

            // Perform fused activation
            if(fused_activation)
            {
                activation_functor(res);
            }

            // Store results
            *(output_ptr + x) = res;
        }
    },
    input, output);
}

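// detail::dummy<T, N> is a no-op activation functor, so the non-fused path only normalizes.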
void NEBatchNormalizationLayerKernel::configure_non_fused()
{
    switch(_input->info()->data_type())
    {
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
        case DataType::F16:
            _func = &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, false, detail::dummy<float16_t, 8>>;
            break;
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
        case DataType::F32:
            _func = &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, false, detail::dummy<float, 4>>;
            break;
        default:
            ARM_COMPUTE_ERROR("Element size not supported");
            break;
    }
}

void NEBatchNormalizationLayerKernel::configure_fused()
{
    // NCHW Fused Batch Normalization with activation functions : FP32
    static std::map<ActivationLayerInfo::ActivationFunction, BatchNormFunctionPtr> bn_fused_map_f32_nchw =
    {
        { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, true, detail::relu<float, 4>> },
        { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, true, detail::brelu<float, 4>> },
        { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, true, detail::lubrelu<float, 4>> }
    };
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    // NCHW Fused Batch Normalization with activation functions : FP16
    static std::map<ActivationLayerInfo::ActivationFunction, BatchNormFunctionPtr> bn_fused_map_f16_nchw =
    {
        { ActivationLayerInfo::ActivationFunction::RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, true, detail::relu<float16_t, 8>> },
        { ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, true, detail::brelu<float16_t, 8>> },
        { ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, true, detail::lubrelu<float16_t, 8>> }
    };
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

    switch(_input->info()->data_type())
    {
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
        case DataType::F16:
            _func = bn_fused_map_f16_nchw[_act_info.activation()];
            break;
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
        case DataType::F32:
            _func = bn_fused_map_f32_nchw[_act_info.activation()];
            break;
        default:
            ARM_COMPUTE_ERROR("Element size not supported");
            break;
    }
}

NEBatchNormalizationLayerKernel::NEBatchNormalizationLayerKernel()
    : _func(nullptr), _input(nullptr), _output(nullptr), _mean(nullptr), _var(nullptr), _gamma(nullptr), _beta(nullptr), _epsilon(), _act_info()
{
}

void NEBatchNormalizationLayerKernel::configure(ITensor *input, ITensor *output,
                                                const ITensor *mean, const ITensor *var,
                                                const ITensor *beta, const ITensor *gamma,
                                                float epsilon, ActivationLayerInfo act_info)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input, mean, var);

    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr,
                                                  mean->info(), var->info(),
                                                  (beta != nullptr) ? beta->info() : nullptr,
                                                  (gamma != nullptr) ? gamma->info() : nullptr,
                                                  epsilon, act_info));

    _input    = input;
    _output   = input;
    _mean     = mean;
    _var      = var;
    _gamma    = gamma;
    _beta     = beta;
    _epsilon  = epsilon;
    _act_info = act_info;

    const bool run_in_place = (output == nullptr) || (output == input);
    if(!run_in_place)
    {
        _output = output;
    }

    // Configure the function to run (NCHW only; NHWC dispatches to a micro-kernel at run time)
    const bool is_nchw = _input->info()->data_layout() == DataLayout::NCHW;
    if(is_nchw)
    {
        if(_act_info.enabled())
        {
            configure_fused();
        }
        else
        {
            configure_non_fused();
        }
    }

    // Configure kernel window
    Window win = calculate_max_window(*input->info(), Steps());
    INEKernel::configure(win);

    if(output != nullptr)
    {
        // Output auto initialization if not yet initialized
        auto_init_if_empty(*output->info(), *input->info()->clone());

        Coordinates coord;
        coord.set_num_dimensions(output->info()->num_dimensions());
        output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
    }
}

Status NEBatchNormalizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output,
                                                 const ITensorInfo *mean, const ITensorInfo *var,
                                                 const ITensorInfo *beta, const ITensorInfo *gamma,
                                                 float epsilon, ActivationLayerInfo act_info)
{
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, mean, var, beta, gamma, epsilon, act_info));

    return Status{};
}

void NEBatchNormalizationLayerKernel::run(const Window &window, const ThreadInfo &info)
{
    ARM_COMPUTE_UNUSED(info);
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
    ARM_COMPUTE_ERROR_ON(_func == nullptr && _input->info()->data_layout() == DataLayout::NCHW);

    const bool is_nchw = _input->info()->data_layout() == DataLayout::NCHW;
    if(is_nchw)
    {
        (this->*_func)(window);
    }
    else
    {
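        // NHWC: dispatch to the micro-kernel selected for the input data type;
        // act_info is forwarded so the micro-kernel handles any fused activation.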
        const auto *uk = get_implementation(BatchNormalizationSelectorData{ _input->info()->data_type() });
        uk->ukernel(_input, _output, _mean, _var, _beta, _gamma, _epsilon, _act_info, window);
    }
}
} // namespace arm_compute
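
For context, here is a minimal sketch of driving this kernel through the public runtime API, NEBatchNormalizationLayer, which wraps it. The shapes, the epsilon value and the choice of ReLU are illustrative assumptions, not values taken from this file.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEBatchNormalizationLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    // Illustrative shapes: an 8x8 NCHW feature map with 3 channels; the
    // per-channel tensors (mean, var, beta, gamma) are 1D of length 3.
    Tensor src, dst, mean, var, beta, gamma;
    src.allocator()->init(TensorInfo(TensorShape(8U, 8U, 3U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(8U, 8U, 3U), 1, DataType::F32));
    for(Tensor *t : { &mean, &var, &beta, &gamma })
    {
        t->allocator()->init(TensorInfo(TensorShape(3U), 1, DataType::F32));
    }

    // Fuse a ReLU: one of the three activations validate_arguments() accepts.
    NEBatchNormalizationLayer bn;
    bn.configure(&src, &dst, &mean, &var, &beta, &gamma, 1e-5f,
                 ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));

    // Allocate backing memory, then (after filling the tensors) execute.
    for(Tensor *t : { &src, &dst, &mean, &var, &beta, &gamma })
    {
        t->allocator()->allocate();
    }
    bn.run();
    return 0;
}

Passing nullptr as the output (or output == input) selects the in-place path seen in configure(); omitting beta and gamma uses the kernel's defaults of 0 and 1 respectively.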