#include "src/core/NEON/kernels/assembly/winograd.hpp"
#include "src/core/NEON/kernels/convolution/common/tensor.hpp"
#include "src/core/NEON/kernels/convolution/common/utils.hpp"
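// Read the 4D shape out of an ITensorInfo regardless of whether the tensor is
// stored as NCHW or NHWC.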
inline Tensor4DShape internal_get_shape(const ITensorInfo *in)
{
    const DataLayout data_layout = in->data_layout();
    const int        in_width    = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH));
    const int        in_height   = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT));
    const int        in_channels = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL));
    const int        in_batches  = in->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES));

    return Tensor4DShape{in_batches, in_height, in_width, in_channels};
}
Status validate_arguments(const ITensorInfo   *src,
                          const ITensorInfo   *weights,
                          const ITensorInfo   *biases,
                          const ITensorInfo   *dst,
                          const PadStrideInfo &conv_info)
{
    ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1,
                                    "Winograd layer only supports unit strides.");
    if (biases != nullptr)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases);
        ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
    }
    return Status{};
}
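// Ask the arm_conv backend for a Winograd implementation matching this
// convolution. On success, `winograd_impl` holds the chosen transforms and GEMM
// arguments, and `conv_args` the canonical convolution descriptor.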
bool get_winograd_kernel_implementation(const ITensorInfo                          *src,
                                        const ITensorInfo                          *weights,
                                        const ITensorInfo                          *dst,
                                        const PadStrideInfo                        &conv_info,
                                        const ActivationLayerInfo                  &act_info,
                                        bool                                        enable_fast_math,
                                        arm_conv::winograd::WinogradImpl           *winograd_impl,
                                        std::unique_ptr<arm_conv::ConvolutionArgs> &conv_args)
{
    arm_conv::winograd::WinogradConfig winograd_cfg;

    const DataType     data_type = src->data_type();
    const unsigned int nthreads  = NEScheduler::get().num_threads();

    Tensor4DShape in_shape{internal_get_shape(src)};
    Tensor4DShape out_shape{internal_get_shape(dst)};
    Tensor4DShape kernel_shape{internal_get_shape(weights)};
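    // Requesting an output tile of 0 x 0 leaves the choice of tile size to the
    // backend heuristics.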
    winograd_cfg.output_rows = 0;
    winograd_cfg.output_cols = 0;

    conv_args = std::make_unique<arm_conv::ConvolutionArgs>(
        in_shape.n_batches,
        arm_conv::Shape2D{static_cast<uint32_t>(in_shape.n_rows), static_cast<uint32_t>(in_shape.n_cols)},
        in_shape.n_channels, conv_info.pad_top(), conv_info.pad_left(),
        arm_conv::Shape2D{static_cast<uint32_t>(out_shape.n_rows), static_cast<uint32_t>(out_shape.n_cols)},
        out_shape.n_channels,
        arm_conv::Shape2D{static_cast<uint32_t>(kernel_shape.n_rows), static_cast<uint32_t>(kernel_shape.n_cols)},
        assembly_utils::map_to_arm_gemm_activation(act_info));
    bool success = false;
    if (data_type == DataType::F32)
    {
        success = arm_conv::winograd::get_implementation<float>(*winograd_impl, &CPUInfo::get(), *conv_args, nthreads,
                                                                enable_fast_math, &winograd_cfg, nullptr);
    }
#if defined(__aarch64__) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
    else if (data_type == DataType::F16)
    {
        success = arm_conv::winograd::get_implementation<__fp16>(*winograd_impl, &CPUInfo::get(), *conv_args, nthreads,
                                                                 enable_fast_math, &winograd_cfg, nullptr);
    }
#endif // defined(__aarch64__) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)
    else
    {
        success = false;
    }
    return success;
}
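// Only activations that the Winograd output transform can apply as a clamp are
// fused; anything else runs as a separate activation pass after the transform.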
inline bool fuse_function_supported(const ActivationLayerInfo &act_info)
{
    return act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU ||
           act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU;
}
CpuWinogradConv2d::CpuWinogradConv2d()
    : _gemm_function(std::make_unique<CpuGemm>()),
      _activation_func(std::make_unique<CpuActivation>()),
      _transform_input_kernel(nullptr),
      _transform_output_kernel(nullptr),
      _permute_input(std::make_unique<CpuPermute>()),
      _permute_output(std::make_unique<CpuPermute>()),
      _permute_weights(std::make_unique<CpuPermute>()),
      _aux_mem(AuxTensorIdx::Count),
      _conv_args{nullptr},
      _winograd_impl{},
      _data_layout(),
      _winograd_transformed_input{},
      _winograd_transformed_output{},
      _winograd_transformed_weights{},
      _input_workspace(),
      _output_workspace(),
      _weights_hwio(),
      _input_nhwc(),
      _output_nhwc(),
      _is_prepared{false},
      _run_activation{false}
{
}
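// configure() queries the backend for an implementation, then sets up the
// permutes, the input/output transform kernels, the batched GEMM and, when it
// cannot be fused, the activation function, recording the auxiliary memory
// each stage needs.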
void CpuWinogradConv2d::configure(const ITensorInfo         *src,
                                  const ITensorInfo         *weights,
                                  const ITensorInfo         *biases,
                                  ITensorInfo               *dst,
                                  const PadStrideInfo       &conv_info,
                                  const ActivationLayerInfo &act_info,
                                  bool                       enable_fast_math)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, biases, dst, conv_info));
    ARM_COMPUTE_UNUSED(biases);

    const DataType     data_type = src->data_type();
    const unsigned int nthreads  = NEScheduler::get().num_threads();

    _data_layout = src->data_layout();
    const Tensor4DShape kernel_shape{internal_get_shape(weights)};

    bool success = get_winograd_kernel_implementation(src, weights, dst, conv_info, act_info, enable_fast_math,
                                                      &_winograd_impl, _conv_args);

    ARM_COMPUTE_EXIT_ON_MSG_VAR(!success, "Unsupported kernel size: %d x %d.\n", kernel_shape.n_rows,
                                kernel_shape.n_cols);
    ARM_COMPUTE_LOG_MSG_WITH_FORMAT_INFO("Using input transform: %s\n",
                                         _winograd_impl.input_transform->get_name().c_str());
    ARM_COMPUTE_LOG_MSG_WITH_FORMAT_INFO("Using weight transform: %s\n",
                                         _winograd_impl.weight_transform->get_name().c_str());
    ARM_COMPUTE_LOG_MSG_WITH_FORMAT_INFO("Using output transform: %s\n",
                                         _winograd_impl.output_transform->get_name().c_str());
    const bool has_impl = ((_winograd_impl.input_transform != nullptr) &&
                           (_winograd_impl.output_transform != nullptr) && (_winograd_impl.gemm_args != nullptr));
    ARM_COMPUTE_EXIT_ON_MSG(!has_impl, "Winograd implementation not found.");
    // Determine how much scratch memory the input/output transforms need
    const size_t input_workspace_size =
        _winograd_impl.input_transform->get_working_space_size(*_conv_args, nthreads);
    const size_t output_workspace_size =
        _winograd_impl.output_transform->get_working_space_size(*_conv_args, nthreads);

    const TensorInfo input_workspace_info(TensorShape(input_workspace_size), 1, DataType::U8);
    const TensorInfo output_workspace_info(TensorShape(output_workspace_size), 1, DataType::U8);
    _input_workspace  = input_workspace_info;
    _output_workspace = output_workspace_info;
    const auto &wds = _winograd_impl.winograd_spec;

    const size_t   data_type_size = src->element_size();
    const uint32_t m              = _winograd_impl.gemm_args->_Msize;
    const uint32_t k              = _winograd_impl.gemm_args->_Ksize;
    const uint32_t n              = _winograd_impl.gemm_args->_Nsize;
    const uint32_t n_gemms        = _winograd_impl.gemm_args->_nmulti;
    const uint32_t n_batches      = _winograd_impl.gemm_args->_nbatches;
    constexpr size_t storage_alignment = 64;
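    // The Winograd GEMM is batched: one [m x k] * [k x n] product per transform
    // matrix (n_gemms of them), where m counts the output tiles, k the input
    // channels and n the output channels.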
    const TensorShape a_shape(k, m, n_batches, n_gemms);
    Strides           a_strides(data_type_size);
    a_strides.set(1, data_type_size * _winograd_impl.winograd_spec.input_ld_row);
    a_strides.set(2, data_type_size * _winograd_impl.winograd_spec.input_ld_batch);
    a_strides.set(3, data_type_size * _winograd_impl.winograd_spec.input_ld_matrix);
    const TensorShape b_shape(n, k, n_gemms);
    Strides           b_strides(data_type_size);
    b_strides.set(1, data_type_size * _winograd_impl.winograd_spec.weight_ld_row);
    b_strides.set(2, data_type_size * _winograd_impl.winograd_spec.weight_ld_matrix);
    const TensorShape d_shape(n, m, n_batches, n_gemms);
    Strides           d_strides(data_type_size);
    d_strides.set(1, data_type_size * _winograd_impl.winograd_spec.output_ld_row);
    d_strides.set(2, data_type_size * _winograd_impl.winograd_spec.output_ld_batch);
    d_strides.set(3, data_type_size * _winograd_impl.winograd_spec.output_ld_matrix);
    TensorInfo a_info{};
    TensorInfo b_info{};
    TensorInfo d_info{};
    a_info.init(a_shape, 1, data_type, a_strides, 0, wds.input_matrix_size_bytes);
    b_info.init(b_shape, 1, data_type, b_strides, 0, wds.weight_matrix_size_bytes);
    d_info.init(d_shape, 1, data_type, d_strides, 0, wds.output_matrix_size_bytes);

    _winograd_transformed_input   = a_info;
    _winograd_transformed_weights = b_info;
    _winograd_transformed_output  = d_info;
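    // These infos view the transformed buffers with the exact strides arm_conv
    // writes them with, so CpuGemm can consume them without any extra copy.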
    PermutationVector weights_permutation_vector(3U, 0U, 1U, 2U);
    // Configure the kernel to permute the input tensor from NCHW to NHWC
    if (_data_layout == DataLayout::NCHW)
    {
        _permute_input->configure(src, &_input_nhwc, PermutationVector(2U, 0U, 1U));
        weights_permutation_vector = PermutationVector(3U, 2U, 0U, 1U);
    }

    // Reorder the weights into HWIO for the weight transform
    _permute_weights->configure(weights, &_weights_hwio, weights_permutation_vector);
    _transform_input_kernel =
        std::make_unique<CpuWinogradConv2dTransformInputKernel>(_winograd_impl, *_conv_args, nthreads);
    _gemm_function->configure(&_winograd_transformed_input, &_winograd_transformed_weights, nullptr,
                              &_winograd_transformed_output, 1.0f, 0.f);
    _transform_output_kernel =
        std::make_unique<CpuWinogradConv2dTransformOutputKernel>(_winograd_impl, *_conv_args, nthreads);
    // Activations that cannot be fused into the output transform run as a
    // separate operator on the destination tensor
    _run_activation = act_info.enabled() && !fuse_function_supported(act_info);
    if (_run_activation)
    {
        _activation_func->configure(dst, nullptr, act_info);
    }
    const auto mm_mem_req = _gemm_function->workspace();
    for (unsigned int slot = 0; slot < mm_mem_req.size(); ++slot)
    {
        _aux_mem[slot] = mm_mem_req[slot];
    }
    // Allocate workspace for the transformed tensors and the permute scratch
    _aux_mem[TransformedInput]   = MemoryInfo(offset_int_vec(TransformedInput), MemoryLifetime::Temporary,
                                              wds.input_matrix_size_bytes, storage_alignment);
    _aux_mem[TransformedOutput]  = MemoryInfo(offset_int_vec(TransformedOutput), MemoryLifetime::Temporary,
                                              wds.output_matrix_size_bytes, storage_alignment);
    _aux_mem[WorkspaceIO]        = MemoryInfo(offset_int_vec(WorkspaceIO), MemoryLifetime::Temporary,
                                              std::max(input_workspace_size, output_workspace_size));
    _aux_mem[PermutedWeights]    = MemoryInfo(offset_int_vec(PermutedWeights), MemoryLifetime::Prepare,
                                              _weights_hwio.total_size());
    _aux_mem[TransformedWeights] = MemoryInfo(offset_int_vec(TransformedWeights), MemoryLifetime::Persistent,
                                              wds.weight_matrix_size_bytes, storage_alignment);
    if (_data_layout == DataLayout::NCHW)
    {
        _aux_mem[PermutedInput].merge(offset_int_vec(PermutedInput), src->total_size());
        _aux_mem[PermutedOutput].merge(offset_int_vec(PermutedOutput), dst->total_size());
    }
}
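// Note the lifetimes above: the transformed weights persist across run() calls,
// the permuted weights are only alive while prepare() executes, and the
// remaining buffers are per-run scratch.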
Status CpuWinogradConv2d::validate(const ITensorInfo         *src,
                                   const ITensorInfo         *weights,
                                   const ITensorInfo         *biases,
                                   const ITensorInfo         *dst,
                                   const PadStrideInfo       &conv_info,
                                   const ActivationLayerInfo &act_info,
                                   bool                       enable_fast_math)
{
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, conv_info));

    // F16 Winograd trades accuracy for speed, so it is only selected with fast math on
    if (!enable_fast_math)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == DataType::F16,
                                        "F16 Winograd is only supported when fast math is enabled");
    }
    const Tensor4DShape                        kernel_shape{internal_get_shape(weights)};
    arm_conv::winograd::WinogradImpl           winograd_impl{};
    std::unique_ptr<arm_conv::ConvolutionArgs> conv_args;
    const bool success = get_winograd_kernel_implementation(src, weights, dst, conv_info, act_info, enable_fast_math,
                                                            &winograd_impl, conv_args);

    ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR(!success, "Unsupported kernel size: %d x %d.\n", kernel_shape.n_rows,
                                        kernel_shape.n_cols);
    ARM_COMPUTE_LOG_MSG_WITH_FORMAT_INFO("Using input transform: %s\n",
                                         winograd_impl.input_transform->get_name().c_str());
    ARM_COMPUTE_LOG_MSG_WITH_FORMAT_INFO("Using weight transform: %s\n",
                                         winograd_impl.weight_transform->get_name().c_str());
    ARM_COMPUTE_LOG_MSG_WITH_FORMAT_INFO("Using output transform: %s\n",
                                         winograd_impl.output_transform->get_name().c_str());

    return Status{};
}
void CpuWinogradConv2d::run(ITensorPack &tensors)
{
    prepare(tensors);

    auto src    = tensors.get_const_tensor(ACL_SRC_0);
    auto biases = tensors.get_const_tensor(ACL_SRC_2);
    auto output = tensors.get_tensor(ACL_DST);

    // The transforms thread internally, so the scheduling window simply spans
    // the available threads
    Window         win;
    const uint32_t nthreads = NEScheduler::get().num_threads();
    win.set(Window::DimX, Window::Dimension(0, nthreads, 1));

    // ... wrap the auxiliary buffers (NHWC scratch, transformed input/output and
    // transform workspaces) in CpuAuxTensorHandler objects ...

    const bool is_nchw = _data_layout == DataLayout::NCHW;
    if (is_nchw)
    {
        // Bring channels to the front, as the Winograd transforms expect NHWC
        ITensorPack pack{{ACL_SRC, src}, {ACL_DST, input_nhwc.get()}};
        _permute_input->run(pack);
    }

    // ... run the input transform kernel, then the batched GEMM on the
    // transformed input and weights ...
    _gemm_function->run(gemm_pack);

    // The output transform writes to the NHWC scratch tensor when the public
    // layout is NCHW, and straight to the destination otherwise
    ITensorPack transform_output_pack{{ACL_SRC_0, winograd_output_transformed.get()},
                                      {ACL_SRC_1, biases},
                                      {ACL_DST, is_nchw ? output_nhwc.get() : output},
                                      {ACL_INT, output_workspace.get()}};
    NEScheduler::get().schedule_op(_transform_output_kernel.get(), Window::DimX, win, transform_output_pack);

    if (is_nchw)
    {
        // Reorder the convolved output back to ACL's NCHW ordering
        ITensorPack pack{{ACL_SRC, output_nhwc.get()}, {ACL_DST, output}};
        _permute_output->run(pack);
    }
    if (_run_activation)
    {
        ITensorPack pack{{ACL_SRC, output}, {ACL_DST, output}};
        _activation_func->run(pack);
    }
}
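// prepare() permutes the weights into HWIO and runs the Winograd weight
// transform once; the transformed weights then persist for every run().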
void CpuWinogradConv2d::prepare(ITensorPack &tensors)
{
    if (!_is_prepared)
    {
        const ITensor *weights = tensors.get_const_tensor(ACL_SRC_1);
        ITensor       *weights_aux =
            utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(PermutedWeights)));

        // Permute the weights into HWIO order for the weight transform
        CpuAuxTensorHandler permuted_weights(_weights_hwio, *weights_aux);
        ITensorPack         permute_tensors{{ACL_SRC, weights}, {ACL_DST, permuted_weights.get()}};
        _permute_weights->run(permute_tensors);

        // Strides, in elements, of the permuted (HWIO) weights
        const int element_size_in_bytes = permuted_weights.get()->info()->element_size();
        const int height_idx            = 3; // H in HWIO
        const int width_idx             = 2; // W in HWIO
        const int channel_idx           = 1; // I in HWIO

        const int permuted_weight_row_stride =
            permuted_weights.get()->info()->strides_in_bytes()[height_idx] / element_size_in_bytes;
        const int permuted_weight_col_stride =
            permuted_weights.get()->info()->strides_in_bytes()[width_idx] / element_size_in_bytes;
        const int permuted_weight_channel_stride =
            permuted_weights.get()->info()->strides_in_bytes()[channel_idx] / element_size_in_bytes;

        ITensor *weights_transf =
            utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(TransformedWeights)));
        CpuAuxTensorHandler winograd_transformed_weights(_winograd_transformed_weights, *weights_transf);
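        // The handler maps the externally managed auxiliary buffer onto the
        // TensorInfo of the transformed weights without taking ownership.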
        const void *permuted_weights_ptr;
        void       *win_wght_transf_ptr;

        permuted_weights_ptr = reinterpret_cast<const void *>(
            permuted_weights.get()->buffer() + permuted_weights.get()->info()->offset_first_element_in_bytes());
        win_wght_transf_ptr =
            reinterpret_cast<void *>(winograd_transformed_weights.get()->buffer() +
                                     winograd_transformed_weights.get()->info()->offset_first_element_in_bytes());
        // Run the weight transform on a single thread (thread 0 of 1)
        _winograd_impl.weight_transform->execute(*_conv_args, permuted_weights_ptr, permuted_weight_row_stride,
                                                 permuted_weight_col_stride, permuted_weight_channel_stride,
                                                 win_wght_transf_ptr, _winograd_impl.winograd_spec, 0, 1);

        // Hand the transformed weights to the GEMM as its RHS operand
        ITensorPack gemm_pack = tensors;
        gemm_pack.add_const_tensor(ACL_SRC_1, winograd_transformed_weights.get());
        _gemm_function->prepare(gemm_pack);

        _is_prepared = true;
    }
}