// Selector payload used to pick a batch-normalization micro-kernel at runtime.
// NOTE(review): the struct body is elided in this extraction — presumably it
// holds a DataType member `dt` (the selector lambdas below read `data.dt`);
// confirm against the full source.
49 struct BatchNormalizationSelectorData
// Predicate type: returns true when a candidate kernel supports `data`.
53 using BatchNormalizationSelectorPtr = std::add_pointer<bool(
const BatchNormalizationSelectorData &data)>
::type;
// Micro-kernel entry-point type. Argument order matches the fpXX_*_batch_normalization
// implementations: (src, dst, mean, var, beta, gamma, epsilon, act_info, window).
54 using BatchNormalizationKernelPtr = std::add_pointer<void(ITensor *, ITensor *,
const ITensor *,
const ITensor *,
const ITensor *,
const ITensor *,
55 float, ActivationLayerInfo &,
const Window &)>
::type;
// Descriptor pairing a human-readable kernel name and a selector predicate
// with its function pointer (`is_selected` / `ukernel` members; struct body
// elided in this extraction).
57 struct BatchNormalizationKernel
// Candidate micro-kernels in priority order: SVE variants first (when the
// compiler targets SVE), then the NEON FP16/FP32 fallbacks.
// NOTE(review): the REGISTER_FP16_SVE/REGISTER_FP32_NEON/... entries, braces
// and #endif lines are elided here — only names and selectors are visible.
64 static const BatchNormalizationKernel available_kernels[] =
66 #if defined(__ARM_FEATURE_SVE) 68 "fp16_sve_batch_normalization",
// Selected for half-precision float input on SVE builds.
69 [](
const BatchNormalizationSelectorData & data) {
return data.dt ==
DataType::F16; },
73 "f32_sve_batch_normalization",
// Selected for single-precision float input on SVE builds.
74 [](
const BatchNormalizationSelectorData & data) {
return data.dt ==
DataType::F32; },
78 #if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) 80 "fp16_neon_batch_normalization",
// NEON FP16 fallback — only compiled when FP16 vector arithmetic is available.
81 [](
const BatchNormalizationSelectorData & data) {
return data.dt ==
DataType::F16; },
86 "f32_neon_batch_normalization",
// NEON FP32 fallback — always available on this target.
87 [](
const BatchNormalizationSelectorData & data) {
return data.dt ==
DataType::F32; },
// Returns the first entry in `available_kernels` whose selector accepts
// `data` (table order encodes priority: SVE before NEON).
// NOTE(review): the return statements are elided in this extraction —
// presumably `return &uk;` on match and `nullptr` when nothing matches.
93 const BatchNormalizationKernel *get_implementation(
const BatchNormalizationSelectorData &data)
95 for(
const auto &uk : available_kernels)
97 if(uk.is_selected(data))
// Validates tensor metadata for batch normalization before configuration:
// a micro-kernel must exist for the input data type, a fused activation (if
// enabled) must be one of RELU / BOUNDED_RELU / LU_BOUNDED_RELU, and the
// optional output must be compatible with the input (checks elided here).
106 validate_arguments(
const ITensorInfo *
input,
const ITensorInfo *output,
const ITensorInfo *mean,
const ITensorInfo *var,
107 const ITensorInfo *beta,
const ITensorInfo *gamma,
float epsilon, ActivationLayerInfo act_info)
// Fail fast if no SVE/NEON kernel supports this data type.
111 const auto *uk = get_implementation(BatchNormalizationSelectorData{
input->data_type() });
114 if(act_info.enabled())
// NOTE(review): `ActivationLayerInfo::ActivationLayerInfo::` is a redundant
// injected-class-name qualification — legal C++, but worth simplifying to a
// single `ActivationLayerInfo::` in the full source.
118 && act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU
119 && act != ActivationLayerInfo::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU);
// Output is optional (in-place execution); only validate it when provided.
123 if(
nullptr != output)
// NCHW batch normalization over `window`: each element is normalized with its
// channel's mean/variance, then scaled by gamma and shifted by beta, with an
// optional fused activation applied via functor F when `fused_activation` is
// set. Vectorized over X in 128-bit lanes with a scalar tail loop.
148 template <
typename T,
bool fused_activation,
typename F>
149 void NEBatchNormalizationLayerKernel::batch_normalization_nchw(
const Window &window)
// Tag type selecting the 128-bit NEON vector flavour for element type T.
152 using ExactTagType =
typename wrapper::traits::neon_bitvector_tag_t<T, wrapper::traits::BitWidth::W128>;
// Elements per vector iteration: 16 bytes / sizeof(T) lanes.
154 const int window_step_x = 16 /
sizeof(T);
155 const auto window_start_x = static_cast<int>(
window.
x().
start());
156 const auto window_end_x = static_cast<int>(
window.
x().
end());
// Work on a copy of the window (X-collapsing setup is elided here).
158 Window win_to_use =
window;
161 Iterator
input(_input, win_to_use);
162 Iterator output(_output, win_to_use);
// Activation functor is constructed once, outside the element loop.
164 F activation_functor(_act_info);
// Base pointers to the per-channel statistic tensors; gamma/beta are
// optional and stay nullptr when the caller did not provide them.
170 const auto input_mean = reinterpret_cast<const T *>(_mean->
ptr_to_element(Coordinates(0, 0)));
171 const auto input_var = reinterpret_cast<const T *>(_var->
ptr_to_element(Coordinates(0, 0)));
172 const auto input_gamma = (_gamma !=
nullptr) ? reinterpret_cast<const T *>(_gamma->
ptr_to_element(Coordinates(0, 0))) :
nullptr;
173 const auto input_beta = (_beta !=
nullptr) ? reinterpret_cast<const T *>(_beta->
ptr_to_element(Coordinates(0, 0))) :
nullptr;
// Per-channel scalars, refreshed each window iteration. Defaults implement
// the identity transform when gamma/beta are absent (gamma=1, beta=0).
175 T mean = static_cast<T>(0);
176 T var = static_cast<T>(0);
177 T gamma = static_cast<T>(1);
178 T beta = static_cast<T>(0);
// NOTE(review): elided code presumably computes denominator as the inverse
// square root of (var + epsilon) — see wrapper::vinvsqrt — confirm in source.
179 T denominator = static_cast<T>(0);
186 const auto epsilon_vec =
wrapper::vdup_n(static_cast<T>(_epsilon), ExactTagType{});
189 const auto input_ptr = reinterpret_cast<const T *>(
input.ptr());
190 const auto output_ptr = reinterpret_cast<T *>(output.ptr());
// id.z() indexes the channel dimension of the NCHW iteration space, so the
// statistics are looked up per channel.
194 mean = input_mean[
id.z()];
195 var = input_var[
id.z()];
198 if(input_gamma !=
nullptr)
200 gamma = input_gamma[
id.z()];
203 if(input_beta !=
nullptr)
205 beta = input_beta[
id.z()];
// Vectorized main loop over X (body largely elided in this extraction).
216 int x = window_start_x;
217 for(; x <= (window_end_x - window_step_x); x += window_step_x)
221 const auto x_bar =
wrapper::vmul(numerator, denominator_vec);
227 activation_functor(res);
// Scalar tail: handle the leftover elements that do not fill a full vector.
235 for(; x < window_end_x; ++x)
237 const T numerator = input_ptr[x] - mean;
238 const T x_bar = numerator * denominator;
239 T res = beta + x_bar * gamma;
// Fused activation applied in-place (guard on fused_activation elided here).
244 activation_functor(res);
248 *(output_ptr + x) = res;
// Selects the non-fused (no activation) NCHW implementation for the current
// data type; `detail::dummy` is a no-op activation functor. The FP16 variant
// is only available when FP16 vector arithmetic is compiled in (switch on
// data type elided in this extraction).
254 void NEBatchNormalizationLayerKernel::configure_non_fused()
258 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 260 _func = &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float16_t, false, detail::dummy<float16_t, 8>>;
262 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 264 _func = &NEBatchNormalizationLayerKernel::batch_normalization_nchw<float, false, detail::dummy<float, 4>>;
// Selects the fused-activation NCHW implementation by looking up the
// requested activation function in a static per-data-type dispatch map
// (map initializers are elided in this extraction).
// NOTE(review): operator[] on these maps default-inserts a null function
// pointer for an unmapped activation — validate_arguments is expected to have
// rejected unsupported activations before this point.
272 void NEBatchNormalizationLayerKernel::configure_fused()
275 static std::map<ActivationLayerInfo::ActivationFunction, BatchNormFunctionPtr> bn_fused_map_f32_nchw =
281 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 283 static std::map<ActivationLayerInfo::ActivationFunction, BatchNormFunctionPtr> bn_fused_map_f16_nchw =
289 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 293 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 295 _func = bn_fused_map_f16_nchw[_act_info.
activation()];
297 #endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC 299 _func = bn_fused_map_f32_nchw[_act_info.
activation()];
// Default constructor: null out all tensor/function pointers; _func and the
// tensors are assigned later in configure().
308 : _func(nullptr), _input(nullptr), _output(nullptr), _mean(nullptr), _var(nullptr), _gamma(nullptr), _beta(nullptr), _epsilon(), _act_info()
// Fragment of configure(): validate arguments (beta/gamma are optional, so
// pass their info() only when present), cache the activation settings,
// select the kernel function, and register the execution window.
321 (beta !=
nullptr) ? beta->
info() :
nullptr,
322 (gamma !=
nullptr) ? gamma->
info() :
nullptr,
332 _act_info = act_info;
// In-place execution when no distinct output tensor was supplied.
334 const bool run_in_place = (output ==
nullptr) || (output ==
input);
// NOTE(review): the fused/non-fused branch is elided here — presumably
// configure_fused() is chosen when act_info is enabled; confirm in source.
350 configure_non_fused();
356 INEKernel::configure(win);
// Only adjust output metadata when an out-of-place output exists.
358 if(output !=
nullptr)
// Fragment of run(): re-resolve the micro-kernel for the configured input
// data type and dispatch it over the thread's window.
// NOTE(review): no null check on `uk` is visible here — validate_arguments
// presumably guarantees a kernel exists; confirm in source.
389 const auto *uk = get_implementation(BatchNormalizationSelectorData{ _input->
info()->
data_type() });
390 uk->ukernel(_input, _output, _mean, _var, _beta, _gamma, _epsilon, _act_info,
window);
Window calculate_max_window(const ValidRegion &valid_region, const Steps &steps, bool skip_border, BorderSize border_size)
const Window & window() const
The maximum window the kernel can be executed on.
uint8_t * ptr_to_element(const Coordinates &id) const
Return a pointer to the element at the passed coordinates.
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(...)
#define REGISTER_FP16_NEON(func_name)
bool enabled() const
Check if initialised.
#define ARM_COMPUTE_ERROR(msg)
Print the given message then throw an std::runtime_error.
#define REGISTER_FP32_NEON(func_name)
float32x2_t vinvsqrt(const float32x2_t &a)
uint8x16_t vloadq(const uint8_t *ptr)
#define ARM_COMPUTE_RETURN_ON_ERROR(status)
Checks if a status contains an error and returns it.
virtual DataType data_type() const =0
Data type used for each element of the tensor.
uint8x8_t vadd(const uint8x8_t &a, const uint8x8_t &b)
static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *mean, const ITensorInfo *var, const ITensorInfo *beta=nullptr, const ITensorInfo *gamma=nullptr, float epsilon=0.001f, ActivationLayerInfo act_info=ActivationLayerInfo())
Static function to check if given info will lead to a valid configuration of NEBatchNormalizationLayerKernel.
1 channel, 1 F32 per channel
#define REGISTER_FP32_SVE(func_name)
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
Store the tensor's metadata.
#define ARM_COMPUTE_ERROR_THROW_ON(status)
uint8x8_t vsub(const uint8x8_t &a, const uint8x8_t &b)
#define ARM_COMPUTE_RETURN_ERROR_ON(cond)
If the condition is true, an error is returned.
Activation Layer Information class.
decltype(strategy::transforms) typedef type
Interface for CPU tensor.
Copyright (c) 2017-2021 Arm Limited.
ActivationFunction
Available activation functions.
1 channel, 1 F16 per channel
NEBatchNormalizationLayerKernel()
Default constructor.
void run(const Window &window, const ThreadInfo &info) override
Execute the kernel on the passed window.
uint8_t vgetlane(const uint8x8_t vector, const unsigned int lane)
static constexpr size_t DimX
Alias for dimension 0 also known as X dimension.
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
Class to describe a number of elements in each dimension.
const BatchNormalizationSelectorPtr is_selected
bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, QuantizationInfo quantization_info=QuantizationInfo())
Auto initialize the tensor info (shape, number of channels and data type) if the current assignment is empty.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
void set(size_t dimension, const Dimension &dim)
Set the values of a given dimension.
#define ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(k)
Num samples, channels, height, width.
void fp32_neon_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, ActivationLayerInfo &act_info, const Window &window)
Lower and Upper Bounded Rectifier ( f(x) = min(a, max(b, x)) )
ScaleKernelInfo info(interpolation_policy, default_border_mode, PixelValue(), sampling_policy, false)
void fp16_neon_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, ActivationLayerInfo &act_info, const Window &window)
uint8x8_t vmul(const uint8x8_t &a, const uint8x8_t &b)
Upper Bounded Rectifier ( f(x) = min(a, max(0, x)) )
Information about executing thread and CPU.
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(...)
#define REGISTER_FP16_SVE(func_name)
BatchNormalizationKernelPtr ukernel
void fp16_sve_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, ActivationLayerInfo &act_info, const Window &window)
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(...)
Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const GEMMLowpOutputStageInfo *output_stage)
void vstore(uint8_t *ptr, uint8x8_t val)
#define ARM_COMPUTE_ERROR_ON_NULLPTR(...)
uint8x8_t vdup_n(uint8_t value, traits::vector_64_tag)
void execute_window_loop(const Window &w, L &&lambda_function, Ts &&... iterators)
Iterate through the passed window, automatically adjusting the iterators and calling the lambda_function for each element.
void configure(ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta=nullptr, const ITensor *gamma=nullptr, float epsilon=0.001f, ActivationLayerInfo act_info=ActivationLayerInfo())
Set the input and output tensors.
ActivationFunction activation() const
Get the type of activation function.
Includes all wrapper headers at once.
uint8x8_t vmla(const uint8x8_t &a, const uint8x8_t &b, const uint8x8_t &c)
size_t get_data_layout_dimension_index(const DataLayout data_layout, const DataLayoutDimension data_layout_dimension)
Get the index of the given dimension.
constexpr int end() const
Return the end of the dimension.
DataType
Available data types.
constexpr int start() const
Return the start of the dimension.
Describe a multidimensional execution window.
#define ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(f, s)
void fp32_sve_batch_normalization(ITensor *src, ITensor *dst, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, ActivationLayerInfo &act_info, const Window &window)
SimpleTensor< T > slice(const SimpleTensor< T > &src, Coordinates starts, Coordinates ends)
virtual DataLayout data_layout() const =0
Get the data layout of the tensor.
constexpr const Dimension & x() const
Alias to access the first dimension of the window.