Compute Library 21.02: NEWinogradConvolutionLayer.cpp
/*
 * Copyright (c) 2017-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h"

#include "arm_compute/core/Error.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "src/core/CPP/Validate.h"
#include "src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h"

#include "src/core/NEON/kernels/convolution/common/utils.hpp"
#include "src/core/NEON/kernels/convolution/winograd/winograd.hpp"

namespace arm_compute
{
namespace
{
inline Status validate_kernel_3x3(const Size2D input_dims, const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
                                  const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
{
    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);

    if(input->data_type() == DataType::F32)
    {
        if(input_dims.width > 4 && input_dims.height > 4)
        {
            ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 4, 4, 3, 3>::validate(input, input0, winograd_info)));
            ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 4, 4, 3, 3>::validate(input1, weights)));
            ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 4, 4, 3, 3>::validate(batched_mm_output, biases, output, winograd_info)));
        }
        else
        {
            ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 2, 2, 3, 3>::validate(input, input0, winograd_info)));
            ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 2, 2, 3, 3>::validate(input1, weights)));
            ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 2, 2, 3, 3>::validate(batched_mm_output, biases, output, winograd_info)));
        }
    }
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    else if(input->data_type() == DataType::F16)
    {
        ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<__fp16, 4, 4, 3, 3>::validate(input, input0, winograd_info)));
        ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<__fp16, 4, 4, 3, 3>::validate(input1, weights)));
        ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<__fp16, 4, 4, 3, 3>::validate(batched_mm_output, biases, output, winograd_info)));
    }
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */

    if(act_info.enabled())
    {
        NEActivationLayer::validate(output, nullptr, act_info);
    }
    return Status{};
}

inline Status validate_kernel_5x5(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
                                  const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
{
    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 2, 2, 5, 5>::validate(input, input0, winograd_info)));
    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 2, 2, 5, 5>::validate(input1, weights)));
    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 2, 2, 5, 5>::validate(batched_mm_output, biases, output, winograd_info)));
    if(act_info.enabled())
    {
        NEActivationLayer::validate(output, nullptr, act_info);
    }
    return Status{};
}

inline Status validate_kernel_3x1(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
                                  const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
{
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 1, 6, 1, 3>::validate(input, input0, winograd_info)));
    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 1, 6, 1, 3>::validate(input1, weights)));
    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 1, 6, 1, 3>::validate(batched_mm_output, biases, output, winograd_info)));
    if(act_info.enabled())
    {
        NEActivationLayer::validate(output, nullptr, act_info);
    }
    return Status{};
}

inline Status validate_kernel_1x3(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
                                  const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
{
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 6, 1, 3, 1>::validate(input, input0, winograd_info)));
    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 6, 1, 3, 1>::validate(input1, weights)));
    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 6, 1, 3, 1>::validate(batched_mm_output, biases, output, winograd_info)));

    if(act_info.enabled())
    {
        NEActivationLayer::validate(output, nullptr, act_info);
    }
    return Status{};
}

inline Status validate_kernel_5x1(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
                                  const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
{
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 1, 4, 1, 5>::validate(input, input0, winograd_info)));
    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 1, 4, 1, 5>::validate(input1, weights)));
    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 1, 4, 1, 5>::validate(batched_mm_output, biases, output, winograd_info)));
    if(act_info.enabled())
    {
        NEActivationLayer::validate(output, nullptr, act_info);
    }
    return Status{};
}
inline Status validate_kernel_1x5(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
                                  const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
{
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 4, 1, 5, 1>::validate(input, input0, winograd_info)));
    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 4, 1, 5, 1>::validate(input1, weights)));
    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 4, 1, 5, 1>::validate(batched_mm_output, biases, output, winograd_info)));
    if(act_info.enabled())
    {
        NEActivationLayer::validate(output, nullptr, act_info);
    }
    return Status{};
}

inline Status validate_kernel_7x1(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
                                  const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
{
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 1, 2, 1, 7>::validate(input, input0, winograd_info)));
    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 1, 2, 1, 7>::validate(input1, weights)));
    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 1, 2, 1, 7>::validate(batched_mm_output, biases, output, winograd_info)));
    if(act_info.enabled())
    {
        NEActivationLayer::validate(output, nullptr, act_info);
    }
    return Status{};
}

inline Status validate_kernel_1x7(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
                                  const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
{
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32);
    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 2, 1, 7, 1>::validate(input, input0, winograd_info)));
    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 2, 1, 7, 1>::validate(input1, weights)));
    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 2, 1, 7, 1>::validate(batched_mm_output, biases, output, winograd_info)));

    if(act_info.enabled())
    {
        NEActivationLayer::validate(output, nullptr, act_info);
    }
    return Status{};
}

inline Tensor4DShape internal_get_input_shape(const arm_compute::ITensor *input)
{
    const DataLayout data_layout = input->info()->data_layout();
    const int        in_width    = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH));
    const int        in_height   = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT));
    const int        in_channels = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL));
    const int        in_batches  = input->info()->dimension(3);

    return Tensor4DShape{ in_batches, in_height, in_width, in_channels };
}
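
// Note that the shape above is returned in {batches, rows, cols, channels} (NHWC)
// order regardless of the source tensor's data layout, since the Winograd kernels
// in this file always operate on NHWC data.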

Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info)
{
    ARM_COMPUTE_UNUSED(output);
    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);

    ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, "Winograd layer only supports unit strides.");
    if(biases != nullptr)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
        ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
    }
    return INEWinogradLayerTransformWeightsKernel::validate(input, weights);
}
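
// The unit-stride check above reflects a property of the algorithm itself: the
// Winograd transforms are derived for stride-1 convolutions, so strided cases
// have to be handled by a different convolution method.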

Size2D winograd_output_tile(const Size2D &input_dims, const Size2D &kernel_dims, DataType data_type)
{
    Size2D output_tile = Size2D{};
    if(kernel_dims == Size2D(3U, 3U))
    {
        output_tile = (input_dims.width <= 4 || input_dims.height <= 4) ? Size2D(2U, 2U) : Size2D(4U, 4U);
        if(data_type == DataType::F16)
        {
            output_tile = Size2D(4U, 4U);
        }
    }
    else if(kernel_dims == Size2D(5U, 5U))
    {
        output_tile = Size2D(2U, 2U);
    }
    else if(kernel_dims == Size2D(1U, 3U))
    {
        output_tile = Size2D(1U, 6U);
    }
    else if(kernel_dims == Size2D(3U, 1U))
    {
        output_tile = Size2D(6U, 1U);
    }
    else if(kernel_dims == Size2D(1U, 5U))
    {
        output_tile = Size2D(1U, 4U);
    }
    else if(kernel_dims == Size2D(5U, 1U))
    {
        output_tile = Size2D(4U, 1U);
    }
    else if(kernel_dims == Size2D(7U, 1U))
    {
        output_tile = Size2D(2U, 1U);
    }
    else if(kernel_dims == Size2D(1U, 7U))
    {
        output_tile = Size2D(1U, 2U);
    }
    return output_tile;
}
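
// Example: a 3x3 kernel on an input larger than 4x4 in both dimensions selects a
// 4x4 output tile, i.e. the F(4x4, 3x3) Winograd algorithm, while smaller inputs
// fall back to F(2x2, 3x3); F16 inputs always use the 4x4 tile here.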

bool check_support_fast_math(const Size2D &output_tile, const Size2D &kernel_size, DataType data_type)
{
    // Check if we want to configure a Winograd configuration which requires fast math
    using WinogradConfiguration = std::pair<std::pair<int, int>, std::pair<int, int>>;

    const std::vector<WinogradConfiguration> fast_math_winograd_f16 =
    {
        WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(3, 3))
    };

    const std::vector<WinogradConfiguration> fast_math_winograd_f32 =
    {
        WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(5, 5)),
        WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5))
    };

    auto p = std::make_pair(std::pair<int, int>(output_tile.width, output_tile.height),
                            std::pair<int, int>(kernel_size.width, kernel_size.height));

    switch(data_type)
    {
        case DataType::F16:
            return std::find(fast_math_winograd_f16.begin(), fast_math_winograd_f16.end(), p) != fast_math_winograd_f16.end();
        case DataType::F32:
            return std::find(fast_math_winograd_f32.begin(), fast_math_winograd_f32.end(), p) != fast_math_winograd_f32.end();
        default:
            return false;
    }
}
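
// The configurations listed above are the ones that trade some numerical accuracy
// for speed, which is why callers have to opt in explicitly through the
// enable_fast_math flag checked in configure() and validate() below.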

inline bool fuse_function_supported(const ActivationLayerInfo &act_info)
{
    return act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU || act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU;
}
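
// Only (bounded) ReLU can be folded into the output transform through arm_gemm's
// activation support; any other enabled activation runs as a separate
// NEActivationLayer after the output transform (see _is_activationlayer_enabled).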

arm_gemm::Activation arm_gemm_activation_from_acl_activation(const ActivationLayerInfo &act_info)
{
    switch(act_info.activation())
    {
        case ActivationLayerInfo::ActivationFunction::RELU:
        {
            return arm_gemm::Activation(arm_gemm::Activation::Type::ReLU, act_info.a(), act_info.b());
        }
        case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
        {
            return arm_gemm::Activation(arm_gemm::Activation::Type::BoundedReLU, act_info.a(), act_info.b());
        }
        default:
        {
            return arm_gemm::Activation(arm_gemm::Activation::Type::None);
        }
    }
}
} // namespace

NEWinogradConvolutionLayer::NEWinogradConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager)
    : _memory_group(memory_manager), _gemm_function(memory_manager), _transform_input_kernel(nullptr), _transform_output_kernel(nullptr), _transform_weights_kernel(nullptr), _activationlayer_function(),
      _permute_input(), _permute_weights(), _permute_output(), _input_transformed(), _output_transformed(), _input_workspace(), _output_workspace(), _kernel_storage(), _input_nhwc(), _output_nhwc(),
      _weights_hwio(), _input(), _weights(), _output(), _is_prepared(false), _is_activationlayer_enabled(false)
{
}

void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info,
                                           bool enable_fast_math)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info));

    // Get indices for the width and height
    const DataLayout   data_layout = input->info()->data_layout();
    const unsigned int width_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
    const unsigned int height_idx  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
    const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);

    const Size2D   input_dims  = Size2D(input->info()->dimension(width_idx), input->info()->dimension(height_idx));
    const Size2D   kernel_size = Size2D(weights->info()->dimension(width_idx), weights->info()->dimension(height_idx));
    const DataType data_type   = input->info()->data_type();
    const Size2D   output_tile = winograd_output_tile(input_dims, kernel_size, data_type);

    // Check if the Winograd configuration requires fast math
    if(!enable_fast_math)
    {
        ARM_COMPUTE_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size, data_type),
                                 "This Winograd configuration requires enable_fast_math=true");
    }

    _weights     = weights;
    _input       = input;
    _output      = output;
    _is_prepared = false;

    int n_gemms = 0;
    int N_BLOCK = 0; // Size of block used by GEMM.

    std::unique_ptr<INEWinogradLayerTransformInputKernel>   transform_input_kernel;
    std::unique_ptr<INEWinogradLayerTransformWeightsKernel> transform_weights_kernel;
    std::unique_ptr<INEWinogradLayerTransformOutputKernel>  transform_output_kernel;

    if(data_type == DataType::F32)
    {
        if(kernel_size == Size2D(3, 3))
        {
            if(input->info()->dimension(width_idx) > 4 && input->info()->dimension(height_idx) > 4)
            {
                using config             = NEWinogradLayerConfiguration<float, float, 4, 4, 3, 3>;
                transform_input_kernel   = std::make_unique<config::TransformInputKernel>();
                transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
                transform_output_kernel  = std::make_unique<config::TransformOutputKernel>();
                n_gemms                  = config::WinogradBase::N_GEMMS;
                N_BLOCK                  = config::WinogradConv::N_BLOCK;
            }
            else
            {
                using config             = NEWinogradLayerConfiguration<float, float, 2, 2, 3, 3>;
                transform_input_kernel   = std::make_unique<config::TransformInputKernel>();
                transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
                transform_output_kernel  = std::make_unique<config::TransformOutputKernel>();
                n_gemms                  = config::WinogradBase::N_GEMMS;
                N_BLOCK                  = config::WinogradConv::N_BLOCK;
            }
        }
        else if(kernel_size == Size2D(5, 5))
        {
            using config             = NEWinogradLayerConfiguration<float, float, 2, 2, 5, 5>;
            transform_input_kernel   = std::make_unique<config::TransformInputKernel>();
            transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
            transform_output_kernel  = std::make_unique<config::TransformOutputKernel>();
            n_gemms                  = config::WinogradBase::N_GEMMS;
            N_BLOCK                  = config::WinogradConv::N_BLOCK;
        }
        else if(kernel_size == Size2D(1, 3))
        {
            using config             = NEWinogradLayerConfiguration<float, float, 6, 1, 3, 1>;
            transform_input_kernel   = std::make_unique<config::TransformInputKernel>();
            transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
            transform_output_kernel  = std::make_unique<config::TransformOutputKernel>();
            n_gemms                  = config::WinogradBase::N_GEMMS;
            N_BLOCK                  = config::WinogradConv::N_BLOCK;
        }
        else if(kernel_size == Size2D(3, 1))
        {
            using config             = NEWinogradLayerConfiguration<float, float, 1, 6, 1, 3>;
            transform_input_kernel   = std::make_unique<config::TransformInputKernel>();
            transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
            transform_output_kernel  = std::make_unique<config::TransformOutputKernel>();
            n_gemms                  = config::WinogradBase::N_GEMMS;
            N_BLOCK                  = config::WinogradConv::N_BLOCK;
        }
        else if(kernel_size == Size2D(1, 5))
        {
            using config             = NEWinogradLayerConfiguration<float, float, 4, 1, 5, 1>;
            transform_input_kernel   = std::make_unique<config::TransformInputKernel>();
            transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
            transform_output_kernel  = std::make_unique<config::TransformOutputKernel>();
            n_gemms                  = config::WinogradBase::N_GEMMS;
            N_BLOCK                  = config::WinogradConv::N_BLOCK;
        }
        else if(kernel_size == Size2D(5, 1))
        {
            using config             = NEWinogradLayerConfiguration<float, float, 1, 4, 1, 5>;
            transform_input_kernel   = std::make_unique<config::TransformInputKernel>();
            transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
            transform_output_kernel  = std::make_unique<config::TransformOutputKernel>();
            n_gemms                  = config::WinogradBase::N_GEMMS;
            N_BLOCK                  = config::WinogradConv::N_BLOCK;
        }
        else if(kernel_size == Size2D(1, 7))
        {
            using config             = NEWinogradLayerConfiguration<float, float, 2, 1, 7, 1>;
            transform_input_kernel   = std::make_unique<config::TransformInputKernel>();
            transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
            transform_output_kernel  = std::make_unique<config::TransformOutputKernel>();
            n_gemms                  = config::WinogradBase::N_GEMMS;
            N_BLOCK                  = config::WinogradConv::N_BLOCK;
        }
        else if(kernel_size == Size2D(7, 1))
        {
            using config             = NEWinogradLayerConfiguration<float, float, 1, 2, 1, 7>;
            transform_input_kernel   = std::make_unique<config::TransformInputKernel>();
            transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
            transform_output_kernel  = std::make_unique<config::TransformOutputKernel>();
            n_gemms                  = config::WinogradBase::N_GEMMS;
            N_BLOCK                  = config::WinogradConv::N_BLOCK;
        }
        else
        {
            ARM_COMPUTE_ERROR("Not supported.");
        }
    }
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    else if(data_type == DataType::F16)
    {
        if(kernel_size == Size2D(3, 3))
        {
            using config             = NEWinogradLayerConfiguration<__fp16, __fp16, 4, 4, 3, 3>;
            transform_input_kernel   = std::make_unique<config::TransformInputKernel>();
            transform_weights_kernel = std::make_unique<config::TransformWeightsKernel>();
            transform_output_kernel  = std::make_unique<config::TransformOutputKernel>();
            n_gemms                  = config::WinogradBase::N_GEMMS;
            N_BLOCK                  = config::WinogradConv::N_BLOCK;
        }
        else
        {
            ARM_COMPUTE_ERROR("Not supported.");
        }
    }
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

    const PaddingType use_padding_type = (conv_info.pad_top() != 0u || conv_info.pad_left() != 0u) ? PADDING_SAME : PADDING_VALID;
    const bool        use_same_padding = use_padding_type == PADDING_SAME;

    // Get convolved dimensions
    const int in_channels  = input->info()->dimension(channel_idx);
    const int out_channels = output->info()->dimension(channel_idx);

    const Tensor4DShape in_shape(internal_get_input_shape(input));
    const size_t        data_type_size = input->info()->element_size();
    // Get the memory required to instantiate a new Winograd operator.
    constexpr size_t storage_alignment = 64;

    // Kernel storage
    const size_t kernel_storage_size = transform_weights_kernel->get_weight_storage_size(out_channels, in_channels) * data_type_size;

    // Input storage
    const size_t input_storage_size = transform_input_kernel->get_input_storage_size(in_shape.n_batches, in_shape.n_channels, in_shape.n_rows, in_shape.n_cols, use_same_padding) * data_type_size;

    // Output storage
    const size_t output_storage_size  = transform_output_kernel->get_output_storage_size(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels) * data_type_size;
    const int    kernel_matrix_stride = transform_weights_kernel->get_matrix_stride(out_channels, in_channels);
    const int    output_matrix_stride = transform_output_kernel->get_matrix_stride(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels);
    const auto   output_shape         = transform_output_kernel->get_output_shape(in_shape.n_rows, in_shape.n_cols, use_padding_type == PADDING_SAME);
    const int    input_matrix_stride  = transform_input_kernel->get_matrix_stride(in_shape.n_batches, in_channels, in_shape.n_rows, in_shape.n_cols, use_padding_type == PADDING_SAME);

    // Configure GEMM
    const int tile_rows                = iceildiv(output_shape.first, output_tile.height);
    const int tile_cols                = iceildiv(output_shape.second, output_tile.width);
    const int m                        = in_shape.n_batches * tile_rows * tile_cols;
    const int k                        = in_shape.n_channels;
    const int n                        = out_channels;
    const int kernel_matrix_row_stride = roundup(out_channels, N_BLOCK);
    const int output_matrix_row_stride = kernel_matrix_row_stride;

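    // The batched GEMM below computes, for each of the n_gemms transformed-domain
    // matrices, an [m x k] * [k x n] product: m = batches * number of output tiles,
    // k = input channels, n = output channels. For example, F(2x2, 3x3) transforms
    // each input patch into a 4x4 block, giving n_gemms = 16, while F(4x4, 3x3)
    // uses 6x6 blocks and n_gemms = 36.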
    TensorShape a_shape(k, m, 1, n_gemms);
    Strides     a_strides(data_type_size);
    a_strides.set(1, a_strides[0] * k);
    //a_strides.set(2, data_type_size * input_matrix_stride / n_gemms); FIXME: This is the real batch size, but RSH's code crashes if it's not 0.
    a_strides.set(2, 0);
    a_strides.set(3, data_type_size * input_matrix_stride);

    TensorShape b_shape(n, k, n_gemms);
    Strides     b_strides(data_type_size);
    b_strides.set(1, data_type_size * kernel_matrix_row_stride);
    b_strides.set(2, data_type_size * kernel_matrix_stride);

    TensorShape d_shape(n, m, 1, n_gemms);
    Strides     d_strides(data_type_size);
    d_strides.set(1, data_type_size * output_matrix_row_stride);
    //d_strides.set(2, data_type_size * output_matrix_stride / n_gemms); FIXME: This is the real batch size, but RSH's code crashes if it's not 0.
    d_strides.set(2, 0);
    d_strides.set(3, data_type_size * output_matrix_stride);

    TensorInfo a_info{};
    TensorInfo b_info{};
    TensorInfo d_info{};
    a_info.init(a_shape, 1, data_type, a_strides, 0, input_storage_size);
    b_info.init(b_shape, 1, data_type, b_strides, 0, kernel_storage_size);
    d_info.init(d_shape, 1, data_type, d_strides, 0, output_storage_size);
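
    // a_info, b_info and d_info describe the operands of the batched GEMM
    // D = A * B: A is the transformed input, B the transformed weights and D the
    // transformed output, with the trailing dimension indexing the n_gemms matrices.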

    _input_transformed.allocator()->init(a_info, storage_alignment);
    _kernel_storage.allocator()->init(b_info, storage_alignment);
    _output_transformed.allocator()->init(d_info, storage_alignment);

    // Configure and allocate the dst tensor used to convert from the Winograd domain to the spatial domain when calling reshape_output()
    TensorInfo info(TensorShape(_output->info()->dimension(2), _output->info()->dimension(0),
                                _output->info()->dimension(1), _output->info()->dimension(3)),
                    1, _output->info()->data_type());
    _output_nhwc.allocator()->init(info);

    const ITensor     *input_to_use  = _input;
    ITensor           *output_to_use = _output;
    PermutationVector  weights_permutation_vector(3U, 0U, 1U, 2U);
    const unsigned int max_num_threads = NEScheduler::get().num_threads();

    // Configure the kernel to transform the input tensor from NCHW -> NHWC
    if(data_layout == DataLayout::NCHW)
    {
        _memory_group.manage(&_input_nhwc);
        _permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U));
        input_to_use               = &_input_nhwc;
        weights_permutation_vector = PermutationVector(3U, 2U, 0U, 1U);
    }

    // Configure input transform kernel
    _memory_group.manage(&_input_transformed);
    _memory_group.manage(&_input_workspace);
    transform_input_kernel->configure(input_to_use, in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type,
                                      &_input_transformed, input_matrix_stride, &_input_workspace);
    const size_t input_workspace_size = transform_input_kernel->get_working_space_size(max_num_threads);
    TensorInfo   input_workspace_info(TensorShape(input_workspace_size), 1, _input->info()->data_type());
    _input_workspace.allocator()->init(input_workspace_info);
    _input_workspace.allocator()->allocate();
    if(data_layout == DataLayout::NCHW)
    {
        _input_nhwc.allocator()->allocate();
    }

    // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map]
    _permute_weights.configure(weights, &_weights_hwio, weights_permutation_vector);
    transform_weights_kernel->configure(&_weights_hwio, &_kernel_storage, kernel_matrix_stride, out_channels, in_channels);

    // Configure GEMM function
    _memory_group.manage(&_output_transformed);
    _gemm_function.configure(&_input_transformed, &_kernel_storage, nullptr, &_output_transformed, 1.0f, 0.f);
    _input_transformed.allocator()->allocate();

    // Configure output transform function
    // The biases tensor has not been allocated at this point in time; the output transform will add the biases to the final result in the run() method
    if(data_layout == DataLayout::NCHW)
    {
        _memory_group.manage(&_output_nhwc);
        output_to_use = &_output_nhwc;
    }
    const arm_gemm::Activation activation = arm_gemm_activation_from_acl_activation(act_info);

    transform_output_kernel->configure(biases,
                                       &_output_transformed,
                                       output_matrix_stride,
                                       output_to_use,
                                       in_shape.n_batches,
                                       output_shape.first,
                                       output_shape.second,
                                       out_channels,
                                       &_output_workspace,
                                       activation);

    const size_t output_workspace_size = transform_output_kernel->get_working_space_size(max_num_threads);
    TensorInfo   output_workspace_info(TensorShape(output_workspace_size), 1, _output->info()->data_type());
    _output_workspace.allocator()->init(output_workspace_info);
    _output_workspace.allocator()->allocate();
    _output_transformed.allocator()->allocate();

    // Reorder the convolved output to ACL's ordering NCHW
    if(data_layout == DataLayout::NCHW)
    {
        _permute_output.configure(&_output_nhwc, _output, PermutationVector(1U, 2U, 0U));
        _output_nhwc.allocator()->allocate();
    }

    _transform_input_kernel   = std::move(transform_input_kernel);
    _transform_weights_kernel = std::move(transform_weights_kernel);
    _transform_output_kernel  = std::move(transform_output_kernel);

    // Configure activation layer
    _is_activationlayer_enabled = act_info.enabled() && !fuse_function_supported(act_info);
    if(_is_activationlayer_enabled)
    {
        _activationlayer_function.configure(_output, nullptr, act_info);
    }
}

void NEWinogradConvolutionLayer::run()
{
    const DataLayout data_layout = _input->info()->data_layout();

    prepare();

    MemoryGroupResourceScope scope_mg(_memory_group);

    if(data_layout == DataLayout::NCHW)
    {
        // Bring channels to the front as the Winograd code expects the tensor to be in the format NHWC
        _permute_input.run();
    }

    // Transform input tensor to the Winograd domain
    NEScheduler::get().schedule(_transform_input_kernel.get(), Window::DimX);

    // Run the n_gemms GEMMs in multiple threads; each kernel runs one or more GEMMs
    _gemm_function.run();

    // Transform output tensor back to the spatial domain
    NEScheduler::get().schedule(_transform_output_kernel.get(), Window::DimX);

    if(data_layout == DataLayout::NCHW)
    {
        // Reorder the convolved output to ACL's ordering NCHW
        _permute_output.run();
    }

    if(_is_activationlayer_enabled)
    {
        _activationlayer_function.run();
    }
}

Status NEWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
                                            const ActivationLayerInfo &act_info, bool enable_fast_math)
{
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info));

    // Get indices for the width and height
    const size_t idx_width  = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
    const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);

    // Input shape, kernel size and output tile
    const Size2D   input_dims  = Size2D(input->dimension(idx_width), input->dimension(idx_height));
    const Size2D   kernel_size = Size2D(weights->dimension(idx_width), weights->dimension(idx_height));
    const DataType data_type   = input->data_type();
    const Size2D   output_tile = winograd_output_tile(input_dims, kernel_size, data_type);

    // Check if the Winograd configuration requires fast math
    if(!enable_fast_math)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size, data_type),
                                        "This Winograd configuration requires enable_fast_math=true");
    }

    const WinogradInfo winograd_info = WinogradInfo(output_tile,
                                                    kernel_size,
                                                    input_dims,
                                                    conv_info,
                                                    input->data_layout());

    // Validate input transform
    const TensorShape input0_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info);
    const TensorInfo  input0       = input->clone()->set_tensor_shape(input0_shape);
    // Validate filter transform
    const TensorShape input1_shape = misc::shape_calculator::compute_winograd_filter_transform_shape(*weights, winograd_info);
    const TensorInfo  input1       = weights->clone()->set_tensor_shape(input1_shape);
    // Validate batched matrix multiply
    TensorShape batched_mm_output_shape = input0.tensor_shape();
    batched_mm_output_shape[0]          = input1.tensor_shape()[0];
    const TensorInfo batched_mm_output  = input0.clone()->set_tensor_shape(batched_mm_output_shape);

    if(kernel_size == Size2D(3, 3))
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 1, "Only SAME or VALID padding supported");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 1, "Only SAME or VALID padding supported");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 1, "Only SAME or VALID padding supported");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 1, "Only SAME or VALID padding supported");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != conv_info.pad_left(), "Only SAME or VALID padding supported");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != conv_info.pad_bottom(), "Only SAME or VALID padding supported");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != conv_info.pad_left(), "Only SAME or VALID padding supported");
        return validate_kernel_3x3(input_dims, input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info);
    }
    else if(kernel_size == Size2D(5, 5))
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 2, "Only SAME or VALID padding supported");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 2, "Only SAME or VALID padding supported");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 2, "Only SAME or VALID padding supported");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 2, "Only SAME or VALID padding supported");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != conv_info.pad_left(), "Only SAME or VALID padding supported");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != conv_info.pad_bottom(), "Only SAME or VALID padding supported");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != conv_info.pad_left(), "Only SAME or VALID padding supported");
        return validate_kernel_5x5(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info);
    }
    if(kernel_size == Size2D(3, 1))
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 1, "Only SAME or VALID padding supported");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 1, "Only SAME or VALID padding supported");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_bottom() != 0, "Only SAME or VALID padding supported");
        return validate_kernel_3x1(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info);
    }
    else if(kernel_size == Size2D(1, 3))
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 1, "Only SAME or VALID padding supported");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 1, "Only SAME or VALID padding supported");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_right() != 0, "Only SAME or VALID padding supported");
        return validate_kernel_1x3(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info);
    }
    else if(kernel_size == Size2D(5, 1))
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 2, "Only SAME or VALID padding supported");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 2, "Only SAME or VALID padding supported");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_bottom() != 0, "Only SAME or VALID padding supported");
        return validate_kernel_5x1(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info);
    }
    else if(kernel_size == Size2D(1, 5))
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 2, "Only SAME or VALID padding supported");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 2, "Only SAME or VALID padding supported");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_right() != 0, "Only SAME or VALID padding supported");
        return validate_kernel_1x5(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info);
    }
    else if(kernel_size == Size2D(7, 1))
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 3, "Only SAME or VALID padding supported");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 3, "Only SAME or VALID padding supported");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_bottom() != 0, "Only SAME or VALID padding supported");
        return validate_kernel_7x1(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info);
    }
    else if(kernel_size == Size2D(1, 7))
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 3, "Only SAME or VALID padding supported");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 3, "Only SAME or VALID padding supported");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_right() != 0, "Only SAME or VALID padding supported");
        return validate_kernel_1x7(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info);
    }
    else
    {
        ARM_COMPUTE_RETURN_ERROR_MSG("Kernel shape not supported");
    }
}

void NEWinogradConvolutionLayer::prepare()
{
    if(!_is_prepared)
    {
        // Permute weights
        _weights_hwio.allocator()->allocate();
        _permute_weights.run();
        _weights->mark_as_unused();

        // Transform weights
        _kernel_storage.allocator()->allocate();
        NEScheduler::get().schedule(_transform_weights_kernel.get(), Window::DimX);
        _weights_hwio.allocator()->free();

        _gemm_function.prepare();
        if(!_kernel_storage.is_used())
        {
            _kernel_storage.allocator()->free();
        }

        _is_prepared = true;
    }
}
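
// prepare() performs the one-off weight pipeline: the weights are permuted to
// HWIO, transformed once into _kernel_storage, and the staging buffer is freed;
// subsequent calls to run() reuse the transformed weights directly.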
} // namespace arm_compute
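
// A minimal usage sketch (not part of this file), assuming F32 NCHW tensors, a
// 3x3 kernel and unit stride with 1-pixel padding; the extents are illustrative:
//
//   Tensor src{}, weights{}, biases{}, dst{};
//   src.allocator()->init(TensorInfo(TensorShape(56U, 56U, 64U), 1, DataType::F32)); // W, H, C
//   weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 64U, 64U), 1, DataType::F32));
//   biases.allocator()->init(TensorInfo(TensorShape(64U), 1, DataType::F32));
//   dst.allocator()->init(TensorInfo(TensorShape(56U, 56U, 64U), 1, DataType::F32));
//
//   const PadStrideInfo conv_info(1, 1, 1, 1); // stride 1x1, pad 1x1 -> SAME for 3x3
//   ARM_COMPUTE_ERROR_THROW_ON(NEWinogradConvolutionLayer::validate(
//       src.info(), weights.info(), biases.info(), dst.info(), conv_info));
//
//   NEWinogradConvolutionLayer conv{};
//   conv.configure(&src, &weights, &biases, &dst, conv_info);
//
//   src.allocator()->allocate();
//   weights.allocator()->allocate();
//   biases.allocator()->allocate();
//   dst.allocator()->allocate();
//   // ... fill src, weights and biases with data ...
//   conv.run(); // the first run() calls prepare() internally to transform the weights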