Compute Library 20.08
NEWinogradConvolutionLayer.cpp
/*
 * Copyright (c) 2017-2020 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h"

#include "arm_compute/core/Error.h"
#include "arm_compute/core/NEON/kernels/NEWinogradConvolutionLayerKernel.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "support/MemorySupport.h"

#include "arm_compute/core/NEON/kernels/convolution/common/utils.hpp"
#include "src/core/NEON/kernels/convolution/winograd/winograd.hpp"

namespace arm_compute
{
namespace
{
inline Status validate_kernel_3x3(const Size2D input_dims, const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
                                  const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
{
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);

    if(input->data_type() == DataType::F32)
    {
        if(input_dims.width > 4 && input_dims.height > 4)
        {
            ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 4, 4, 3, 3>::validate(input, input0, winograd_info)));
            ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 4, 4, 3, 3>::validate(weights, input1, winograd_info)));
            ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 4, 4, 3, 3>::validate(batched_mm_output, biases, output, winograd_info)));
        }
        else
        {
            ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 2, 2, 3, 3>::validate(input, input0, winograd_info)));
            ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 2, 2, 3, 3>::validate(weights, input1, winograd_info)));
            ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 2, 2, 3, 3>::validate(batched_mm_output, biases, output, winograd_info)));
        }
    }
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    else if(input->data_type() == DataType::F16)
    {
        ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<__fp16, 4, 4, 3, 3>::validate(input, input0, winograd_info)));
        ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<__fp16, 4, 4, 3, 3>::validate(weights, input1, winograd_info)));
        ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<__fp16, 4, 4, 3, 3>::validate(batched_mm_output, biases, output, winograd_info)));
    }
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */

    if(act_info.enabled())
    {
        NEActivationLayer::validate(output, nullptr, act_info);
    }
    return Status{};
}

inline Status validate_kernel_5x5(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
                                  const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
{
    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 2, 2, 5, 5>::validate(input, input0, winograd_info)));
    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 2, 2, 5, 5>::validate(weights, input1, winograd_info)));
    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 2, 2, 5, 5>::validate(batched_mm_output, biases, output, winograd_info)));
    if(act_info.enabled())
    {
        NEActivationLayer::validate(output, nullptr, act_info);
    }
    return Status{};
}

inline Status validate_kernel_3x1(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
                                  const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
{
    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 1, 6, 1, 3>::validate(input, input0, winograd_info)));
    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 1, 6, 1, 3>::validate(weights, input1, winograd_info)));
    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 1, 6, 1, 3>::validate(batched_mm_output, biases, output, winograd_info)));

    if(act_info.enabled())
    {
        NEActivationLayer::validate(output, nullptr, act_info);
    }
    return Status{};
}

inline Status validate_kernel_1x3(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
                                  const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
{
    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 6, 1, 3, 1>::validate(input, input0, winograd_info)));
    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 6, 1, 3, 1>::validate(weights, input1, winograd_info)));
    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 6, 1, 3, 1>::validate(batched_mm_output, biases, output, winograd_info)));

    if(act_info.enabled())
    {
        NEActivationLayer::validate(output, nullptr, act_info);
    }
    return Status{};
}

inline Status validate_kernel_5x1(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
                                  const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
{
    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 1, 4, 1, 5>::validate(input, input0, winograd_info)));
    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 1, 4, 1, 5>::validate(weights, input1, winograd_info)));
    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 1, 4, 1, 5>::validate(batched_mm_output, biases, output, winograd_info)));
    if(act_info.enabled())
    {
        NEActivationLayer::validate(output, nullptr, act_info);
    }
    return Status{};
}
inline Status validate_kernel_1x5(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
                                  const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
{
    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 4, 1, 5, 1>::validate(input, input0, winograd_info)));
    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 4, 1, 5, 1>::validate(weights, input1, winograd_info)));
    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 4, 1, 5, 1>::validate(batched_mm_output, biases, output, winograd_info)));
    if(act_info.enabled())
    {
        NEActivationLayer::validate(output, nullptr, act_info);
    }
    return Status{};
}

inline Status validate_kernel_7x1(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
                                  const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
{
    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 1, 2, 1, 7>::validate(input, input0, winograd_info)));
    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 1, 2, 1, 7>::validate(weights, input1, winograd_info)));
    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 1, 2, 1, 7>::validate(batched_mm_output, biases, output, winograd_info)));
    if(act_info.enabled())
    {
        NEActivationLayer::validate(output, nullptr, act_info);
    }
    return Status{};
}

inline Status validate_kernel_1x7(const ITensorInfo *input, const TensorInfo *input0, const TensorInfo *input1, const TensorInfo *batched_mm_output,
                                  const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info)
{
    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 2, 1, 7, 1>::validate(input, input0, winograd_info)));
    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 2, 1, 7, 1>::validate(weights, input1, winograd_info)));
    ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 2, 1, 7, 1>::validate(batched_mm_output, biases, output, winograd_info)));

    if(act_info.enabled())
    {
        NEActivationLayer::validate(output, nullptr, act_info);
    }
    return Status{};
}

inline Tensor4DShape internal_get_input_shape(const arm_compute::ITensor *input)
{
    const DataLayout data_layout = input->info()->data_layout();
    const int        in_width    = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH));
    const int        in_height   = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT));
    const int        in_channels = input->info()->dimension(get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL));
    const int        in_batches  = input->info()->dimension(3);

    return Tensor4DShape{ in_batches, in_height, in_width, in_channels };
}

Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info)
{
    ARM_COMPUTE_UNUSED(output);
    ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input);

    ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, "Winograd layer only supports unit strides.");
    if(biases != nullptr)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
        ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
    }
    return Status{};
}

Size2D winograd_output_tile(const Size2D &input_dims, const Size2D &kernel_dims, DataType data_type)
{
    Size2D output_tile = Size2D{};
    if(kernel_dims == Size2D(3U, 3U))
    {
        output_tile = (input_dims.width <= 4 || input_dims.height <= 4) ? Size2D(2U, 2U) : Size2D(4U, 4U);
        if(data_type == DataType::F16)
        {
            output_tile = Size2D(4U, 4U);
        }
    }
    else if(kernel_dims == Size2D(5U, 5U))
    {
        output_tile = Size2D(2U, 2U);
    }
    else if(kernel_dims == Size2D(1U, 3U))
    {
        output_tile = Size2D(1U, 6U);
    }
    else if(kernel_dims == Size2D(3U, 1U))
    {
        output_tile = Size2D(6U, 1U);
    }
    else if(kernel_dims == Size2D(1U, 5U))
    {
        output_tile = Size2D(1U, 4U);
    }
    else if(kernel_dims == Size2D(5U, 1U))
    {
        output_tile = Size2D(4U, 1U);
    }
    else if(kernel_dims == Size2D(7U, 1U))
    {
        output_tile = Size2D(2U, 1U);
    }
    else if(kernel_dims == Size2D(1U, 7U))
    {
        output_tile = Size2D(1U, 2U);
    }
    return output_tile;
}
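// Note: for an output tile of size (m x n) and a kernel of size (r x s), the
// Winograd input transform works on input tiles of size (m + r - 1) x (n + s - 1),
// and the batched GEMM below runs one multiply per transformed tile element.
// E.g. F(4x4, 3x3) uses 6x6 input tiles and therefore 36 GEMMs, while
// F(2x2, 3x3) uses 4x4 tiles and 16 GEMMs.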

bool check_support_fast_math(const Size2D &output_tile, const Size2D &kernel_size, DataType data_type)
{
    // Check if we want to configure a Winograd configuration which requires fast math
    using WinogradConfiguration = std::pair<std::pair<int, int>, std::pair<int, int>>;

    const std::vector<WinogradConfiguration> fast_math_winograd_f16 =
    {
        WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(3, 3))
    };

    const std::vector<WinogradConfiguration> fast_math_winograd_f32 =
    {
        WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(5, 5)),
        WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5))
    };

    auto p = std::make_pair(std::pair<int, int>(output_tile.width, output_tile.height),
                            std::pair<int, int>(kernel_size.width, kernel_size.height));

    switch(data_type)
    {
        case DataType::F16:
            return std::find(fast_math_winograd_f16.begin(), fast_math_winograd_f16.end(), p) != fast_math_winograd_f16.end();
        case DataType::F32:
            return std::find(fast_math_winograd_f32.begin(), fast_math_winograd_f32.end(), p) != fast_math_winograd_f32.end();
        default:
            return false;
    }
}
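// Rationale: larger output tiles amplify the numerical error introduced by the
// Winograd transforms, so configurations such as the 5x5 variants in FP32 and
// F(4x4, 3x3) in FP16 are only selected when the caller opts into reduced
// accuracy via enable_fast_math.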

inline bool fuse_function_supported(const ActivationLayerInfo &act_info)
{
    return act_info.activation() == ActivationLayerInfo::ActivationFunction::RELU || act_info.activation() == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU;
}
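// Only (bounded) ReLU can be folded into the Winograd output transform; any
// other activation falls back to a separate NEActivationLayer pass (see
// _is_activationlayer_enabled in configure()).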

arm_gemm::Activation arm_gemm_activation_from_acl_activation(const ActivationLayerInfo &act_info)
{
    switch(act_info.activation())
    {
        case ActivationLayerInfo::ActivationFunction::RELU:
        {
            return arm_gemm::Activation(arm_gemm::Activation::Type::ReLU, act_info.a(), act_info.b());
        }
        case ActivationLayerInfo::ActivationFunction::BOUNDED_RELU:
        {
            return arm_gemm::Activation(arm_gemm::Activation::Type::BoundedReLU, act_info.a(), act_info.b());
        }
        default:
        {
            return arm_gemm::Activation(arm_gemm::Activation::Type::None);
        }
    }
}
} // namespace

NEWinogradConvolutionLayer::NEWinogradConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager)
    : _memory_group(memory_manager), _gemm_function(memory_manager), _transform_input_kernel(nullptr), _transform_output_kernel(nullptr), _transform_weights_kernel(nullptr), _activationlayer_function(),
      _permute_input(), _permute_weights(), _permute_output(), _input_transformed(), _output_transformed(), _input_workspace(), _output_workspace(), _kernel_storage(), _input_nhwc(), _output_nhwc(),
      _weights_hwio(), _input(), _weights(), _output(), _is_prepared(false), _is_activationlayer_enabled(false)
{
}

void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info,
                                           bool enable_fast_math)
{
    ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info));

    // Get indices for the width, height and channel dimensions
    const DataLayout   data_layout = input->info()->data_layout();
    const unsigned int width_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
    const unsigned int height_idx  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
    const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);

    const Size2D   input_dims  = Size2D(input->info()->dimension(width_idx), input->info()->dimension(height_idx));
    const Size2D   kernel_size = Size2D(weights->info()->dimension(width_idx), weights->info()->dimension(height_idx));
    const DataType data_type   = input->info()->data_type();
    const Size2D   output_tile = winograd_output_tile(input_dims, kernel_size, data_type);

    // Check if the Winograd configuration requires fast math
    if(!enable_fast_math)
    {
        ARM_COMPUTE_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size, data_type),
                                 "This Winograd configuration requires enable_fast_math=true");
    }

    _weights     = weights;
    _input       = input;
    _output      = output;
    _is_prepared = false;

    int n_gemms = 0;
    int N_BLOCK = 0; // Size of block used by GEMM.

    std::unique_ptr<INEWinogradLayerTransformInputKernel>   transform_input_kernel;
    std::unique_ptr<INEWinogradLayerTransformWeightsKernel> transform_weights_kernel;
    std::unique_ptr<INEWinogradLayerTransformOutputKernel>  transform_output_kernel;

    if(data_type == DataType::F32)
    {
        if(kernel_size == Size2D(3, 3))
        {
            if(input->info()->dimension(width_idx) > 4 && input->info()->dimension(height_idx) > 4)
            {
                using config             = NEWinogradLayerConfiguration<float, float, 4, 4, 3, 3>;
                transform_input_kernel   = support::cpp14::make_unique<config::TransformInputKernel>();
                transform_weights_kernel = support::cpp14::make_unique<config::TransformWeightsKernel>();
                transform_output_kernel  = support::cpp14::make_unique<config::TransformOutputKernel>();
                n_gemms                  = config::WinogradBase::N_GEMMS;
                N_BLOCK                  = config::WinogradConv::N_BLOCK;
            }
            else
            {
                using config             = NEWinogradLayerConfiguration<float, float, 2, 2, 3, 3>;
                transform_input_kernel   = support::cpp14::make_unique<config::TransformInputKernel>();
                transform_weights_kernel = support::cpp14::make_unique<config::TransformWeightsKernel>();
                transform_output_kernel  = support::cpp14::make_unique<config::TransformOutputKernel>();
                n_gemms                  = config::WinogradBase::N_GEMMS;
                N_BLOCK                  = config::WinogradConv::N_BLOCK;
            }
        }
        else if(kernel_size == Size2D(5, 5))
        {
            using config             = NEWinogradLayerConfiguration<float, float, 2, 2, 5, 5>;
            transform_input_kernel   = support::cpp14::make_unique<config::TransformInputKernel>();
            transform_weights_kernel = support::cpp14::make_unique<config::TransformWeightsKernel>();
            transform_output_kernel  = support::cpp14::make_unique<config::TransformOutputKernel>();
            n_gemms                  = config::WinogradBase::N_GEMMS;
            N_BLOCK                  = config::WinogradConv::N_BLOCK;
        }
        else if(kernel_size == Size2D(1, 3))
        {
            using config             = NEWinogradLayerConfiguration<float, float, 6, 1, 3, 1>;
            transform_input_kernel   = support::cpp14::make_unique<config::TransformInputKernel>();
            transform_weights_kernel = support::cpp14::make_unique<config::TransformWeightsKernel>();
            transform_output_kernel  = support::cpp14::make_unique<config::TransformOutputKernel>();
            n_gemms                  = config::WinogradBase::N_GEMMS;
            N_BLOCK                  = config::WinogradConv::N_BLOCK;
        }
        else if(kernel_size == Size2D(3, 1))
        {
            using config             = NEWinogradLayerConfiguration<float, float, 1, 6, 1, 3>;
            transform_input_kernel   = support::cpp14::make_unique<config::TransformInputKernel>();
            transform_weights_kernel = support::cpp14::make_unique<config::TransformWeightsKernel>();
            transform_output_kernel  = support::cpp14::make_unique<config::TransformOutputKernel>();
            n_gemms                  = config::WinogradBase::N_GEMMS;
            N_BLOCK                  = config::WinogradConv::N_BLOCK;
        }
        else if(kernel_size == Size2D(1, 5))
        {
            using config             = NEWinogradLayerConfiguration<float, float, 4, 1, 5, 1>;
            transform_input_kernel   = support::cpp14::make_unique<config::TransformInputKernel>();
            transform_weights_kernel = support::cpp14::make_unique<config::TransformWeightsKernel>();
            transform_output_kernel  = support::cpp14::make_unique<config::TransformOutputKernel>();
            n_gemms                  = config::WinogradBase::N_GEMMS;
            N_BLOCK                  = config::WinogradConv::N_BLOCK;
        }
        else if(kernel_size == Size2D(5, 1))
        {
            using config             = NEWinogradLayerConfiguration<float, float, 1, 4, 1, 5>;
            transform_input_kernel   = support::cpp14::make_unique<config::TransformInputKernel>();
            transform_weights_kernel = support::cpp14::make_unique<config::TransformWeightsKernel>();
            transform_output_kernel  = support::cpp14::make_unique<config::TransformOutputKernel>();
            n_gemms                  = config::WinogradBase::N_GEMMS;
            N_BLOCK                  = config::WinogradConv::N_BLOCK;
        }
        else if(kernel_size == Size2D(1, 7))
        {
            using config             = NEWinogradLayerConfiguration<float, float, 2, 1, 7, 1>;
            transform_input_kernel   = support::cpp14::make_unique<config::TransformInputKernel>();
            transform_weights_kernel = support::cpp14::make_unique<config::TransformWeightsKernel>();
            transform_output_kernel  = support::cpp14::make_unique<config::TransformOutputKernel>();
            n_gemms                  = config::WinogradBase::N_GEMMS;
            N_BLOCK                  = config::WinogradConv::N_BLOCK;
        }
        else if(kernel_size == Size2D(7, 1))
        {
            using config             = NEWinogradLayerConfiguration<float, float, 1, 2, 1, 7>;
            transform_input_kernel   = support::cpp14::make_unique<config::TransformInputKernel>();
            transform_weights_kernel = support::cpp14::make_unique<config::TransformWeightsKernel>();
            transform_output_kernel  = support::cpp14::make_unique<config::TransformOutputKernel>();
            n_gemms                  = config::WinogradBase::N_GEMMS;
            N_BLOCK                  = config::WinogradConv::N_BLOCK;
        }
        else
        {
            ARM_COMPUTE_ERROR("Not supported.");
        }
    }
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
    else if(data_type == DataType::F16)
    {
        if(kernel_size == Size2D(3, 3))
        {
            using config             = NEWinogradLayerConfiguration<__fp16, __fp16, 4, 4, 3, 3>;
            transform_input_kernel   = support::cpp14::make_unique<config::TransformInputKernel>();
            transform_weights_kernel = support::cpp14::make_unique<config::TransformWeightsKernel>();
            transform_output_kernel  = support::cpp14::make_unique<config::TransformOutputKernel>();
            n_gemms                  = config::WinogradBase::N_GEMMS;
            N_BLOCK                  = config::WinogradConv::N_BLOCK;
        }
        else
        {
            ARM_COMPUTE_ERROR("Not supported.");
        }
    }
#endif // __ARM_FEATURE_FP16_VECTOR_ARITHMETIC

    const PaddingType use_padding_type = (conv_info.pad_top() != 0u || conv_info.pad_left() != 0) ? PADDING_SAME : PADDING_VALID;
    const bool        use_same_padding = use_padding_type == PADDING_SAME;

    // Get convolved dimensions
    const int in_channels  = input->info()->dimension(channel_idx);
    const int out_channels = output->info()->dimension(channel_idx);

    const Tensor4DShape in_shape(internal_get_input_shape(input));
    const size_t        data_type_size = input->info()->element_size();
    // Get the memory required to instantiate a new Winograd operator.
    constexpr size_t storage_alignment = 64;

    // Kernel storage
    const size_t kernel_storage_size = transform_weights_kernel->get_weight_storage_size(out_channels, in_channels) * data_type_size;

    // Input storage
    const size_t input_storage_size = transform_input_kernel->get_input_storage_size(in_shape.n_batches, in_shape.n_channels, in_shape.n_rows, in_shape.n_cols, use_same_padding) * data_type_size;

    // Output storage
    const size_t output_storage_size  = transform_output_kernel->get_output_storage_size(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels) * data_type_size;
    const int    kernel_matrix_stride = transform_weights_kernel->get_matrix_stride(out_channels, in_channels);
    const int    output_matrix_stride = transform_output_kernel->get_matrix_stride(in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, out_channels);
    const auto   output_shape         = transform_output_kernel->get_output_shape(in_shape.n_rows, in_shape.n_cols, use_padding_type == PADDING_SAME);
    const int    input_matrix_stride  = transform_input_kernel->get_matrix_stride(in_shape.n_batches, in_channels, in_shape.n_rows, in_shape.n_cols, use_padding_type == PADDING_SAME);

    // Configure GEMM
    const int tile_rows                = iceildiv(output_shape.first, output_tile.height);
    const int tile_cols                = iceildiv(output_shape.second, output_tile.width);
    const int m                        = in_shape.n_batches * tile_rows * tile_cols;
    const int k                        = in_shape.n_channels;
    const int n                        = out_channels;
    const int kernel_matrix_row_stride = roundup(out_channels, N_BLOCK);
    const int output_matrix_row_stride = kernel_matrix_row_stride;

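    // Worked example: a 1-batch 56x56 output with a 4x4 output tile gives
    // tile_rows = tile_cols = iceildiv(56, 4) = 14, so m = 1 * 14 * 14 = 196,
    // while k is the input channel count and n the output channel count. Each
    // of the n_gemms batched GEMMs below is then an (m x k) * (k x n) multiply.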
    TensorShape a_shape(k, m, 1, n_gemms);
    Strides     a_strides(data_type_size);
    a_strides.set(1, a_strides[0] * k);
    //a_strides.set(2, data_type_size * input_matrix_stride / n_gemms); FIXME: This is the real batch size, but RSH's code crashes if it's not 0.
    a_strides.set(2, 0);
    a_strides.set(3, data_type_size * input_matrix_stride);

    TensorShape b_shape(n, k, n_gemms);
    Strides     b_strides(data_type_size);
    b_strides.set(1, data_type_size * kernel_matrix_row_stride);
    b_strides.set(2, data_type_size * kernel_matrix_stride);

    TensorShape d_shape(n, m, 1, n_gemms);
    Strides     d_strides(data_type_size);
    d_strides.set(1, data_type_size * output_matrix_row_stride);
    //d_strides.set(2, data_type_size * output_matrix_stride / n_gemms); FIXME: This is the real batch size, but RSH's code crashes if it's not 0.
    d_strides.set(2, 0);
    d_strides.set(3, data_type_size * output_matrix_stride);

    TensorInfo a_info{};
    TensorInfo b_info{};
    TensorInfo d_info{};
    a_info.init(a_shape, 1, data_type, a_strides, 0, input_storage_size);
    b_info.init(b_shape, 1, data_type, b_strides, 0, kernel_storage_size);
    d_info.init(d_shape, 1, data_type, d_strides, 0, output_storage_size);

    _input_transformed.allocator()->init(a_info, storage_alignment);
    _kernel_storage.allocator()->init(b_info, storage_alignment);
    _output_transformed.allocator()->init(d_info, storage_alignment);

    // Configure and allocate the dst tensor used to convert the output from the Winograd domain back to the spatial domain when calling reshape_output()
    TensorInfo info(TensorShape(_output->info()->dimension(2), _output->info()->dimension(0),
                                _output->info()->dimension(1), _output->info()->dimension(3)),
                    1, _output->info()->data_type());
    _output_nhwc.allocator()->init(info);

    const ITensor     *input_to_use  = _input;
    ITensor           *output_to_use = _output;
    PermutationVector  weights_permutation_vector(3U, 0U, 1U, 2U);
    const unsigned int max_num_threads = NEScheduler::get().num_threads();

    // Configure the kernel to transform the input tensor from NCHW -> NHWC
    if(data_layout == DataLayout::NCHW)
    {
        _memory_group.manage(&_input_nhwc);
        _permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U));
        input_to_use               = &_input_nhwc;
        weights_permutation_vector = PermutationVector(3U, 2U, 0U, 1U);
    }

    // Configure input transform kernel
    _memory_group.manage(&_input_transformed);
    _memory_group.manage(&_input_workspace);
    transform_input_kernel->configure(input_to_use, in_shape.n_batches, in_shape.n_rows, in_shape.n_cols, in_shape.n_channels, use_padding_type,
                                      &_input_transformed, input_matrix_stride, &_input_workspace);
    const size_t input_workspace_size = transform_input_kernel->get_working_space_size(max_num_threads);
    TensorInfo   input_workspace_info(TensorShape(input_workspace_size), 1, _input->info()->data_type());
    _input_workspace.allocator()->init(input_workspace_info);
    _input_workspace.allocator()->allocate();
    if(data_layout == DataLayout::NCHW)
    {
        _input_nhwc.allocator()->allocate();
    }

    // Re-order a weight tensor from [Output feature map x Input feature map x Height x Width] to [Height x Width x Input feature map x Output feature map]
    _permute_weights.configure(weights, &_weights_hwio, weights_permutation_vector);
    transform_weights_kernel->configure(&_weights_hwio, &_kernel_storage, kernel_matrix_stride, out_channels, in_channels);

    // Configure GEMM function
    _memory_group.manage(&_output_transformed);
    _gemm_function.configure(&_input_transformed, &_kernel_storage, nullptr, &_output_transformed, 1.0f, 0.f);
    _input_transformed.allocator()->allocate();

    // Configure output transform function
    // The biases tensor has not been allocated at this point in time; the output transform will add the biases to the final result in the run() method
    if(data_layout == DataLayout::NCHW)
    {
        _memory_group.manage(&_output_nhwc);
        output_to_use = &_output_nhwc;
    }
    const arm_gemm::Activation activation = arm_gemm_activation_from_acl_activation(act_info);

    transform_output_kernel->configure(biases,
                                       &_output_transformed,
                                       output_matrix_stride,
                                       output_to_use,
                                       in_shape.n_batches,
                                       output_shape.first,
                                       output_shape.second,
                                       out_channels,
                                       &_output_workspace,
                                       activation);

    const size_t output_workspace_size = transform_output_kernel->get_working_space_size(max_num_threads);
    TensorInfo   output_workspace_info(TensorShape(output_workspace_size), 1, _output->info()->data_type());
    _output_workspace.allocator()->init(output_workspace_info);
    _output_workspace.allocator()->allocate();
    _output_transformed.allocator()->allocate();

    // Reorder the convolved output to ACL's ordering NCHW
    if(data_layout == DataLayout::NCHW)
    {
        _permute_output.configure(&_output_nhwc, _output, PermutationVector(1U, 2U, 0U));
        _output_nhwc.allocator()->allocate();
    }

    _transform_input_kernel   = std::move(transform_input_kernel);
    _transform_weights_kernel = std::move(transform_weights_kernel);
    _transform_output_kernel  = std::move(transform_output_kernel);

    // Configure activation layer
    _is_activationlayer_enabled = act_info.enabled() && !fuse_function_supported(act_info);
    if(_is_activationlayer_enabled)
    {
        _activationlayer_function.configure(_output, nullptr, act_info);
    }
}

void NEWinogradConvolutionLayer::run()
{
    const DataLayout data_layout = _input->info()->data_layout();

    prepare();

    MemoryGroupResourceScope scope_mg(_memory_group);

    if(data_layout == DataLayout::NCHW)
    {
        // Bring channels to the front as the Winograd code expects the tensor to be in the format NHWC
        _permute_input.run();
    }

    // Transform input tensor to the Winograd domain
    NEScheduler::get().schedule(_transform_input_kernel.get(), Window::DimX);

    // Run the batched GEMMs in multiple threads; each kernel runs one or more GEMMs
    _gemm_function.run();

    // Transform output tensor to the spatial domain
    NEScheduler::get().schedule(_transform_output_kernel.get(), Window::DimX);

    if(data_layout == DataLayout::NCHW)
    {
        // Reorder the convolved output to ACL's ordering NCHW
        _permute_output.run();
    }

    if(_is_activationlayer_enabled)
    {
        _activationlayer_function.run();
    }
}

Status NEWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
                                            const ActivationLayerInfo &act_info, bool enable_fast_math)
{
    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info));

    // Get indices for the width and height
    const size_t idx_width  = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
    const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);

    // Input shape, kernel size and output tile
    const Size2D   input_dims  = Size2D(input->dimension(idx_width), input->dimension(idx_height));
    const Size2D   kernel_size = Size2D(weights->dimension(idx_width), weights->dimension(idx_height));
    const DataType data_type   = input->data_type();
    const Size2D   output_tile = winograd_output_tile(input_dims, kernel_size, data_type);

    // Check if the Winograd configuration requires fast math
    if(!enable_fast_math)
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size, data_type),
                                        "This Winograd configuration requires enable_fast_math=true");
    }

    const WinogradInfo winograd_info = WinogradInfo(output_tile,
                                                    kernel_size,
                                                    input_dims,
                                                    conv_info,
                                                    input->data_layout());

    // Validate input transform
    const TensorShape input0_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info);
    const TensorInfo  input0       = input->clone()->set_tensor_shape(input0_shape);
    // Validate filter transform
    const TensorShape input1_shape = misc::shape_calculator::compute_winograd_filter_transform_shape(*weights, winograd_info);
    const TensorInfo  input1       = weights->clone()->set_tensor_shape(input1_shape);
    // Validate batched matrix multiply
    TensorShape batched_mm_output_shape = input0.tensor_shape();
    batched_mm_output_shape[0]          = input1.tensor_shape()[0];
    const TensorInfo batched_mm_output  = input0.clone()->set_tensor_shape(batched_mm_output_shape);

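    // For every kernel shape below, the supported padding along each dimension is
    // either 0 (VALID) or (kernel_dim - 1) / 2 (SAME) -- e.g. 1 for a 3-tap and
    // 2 for a 5-tap kernel -- and must be symmetric.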
    if(kernel_size == Size2D(3, 3))
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 1, "Only SAME or VALID padding supported");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 1, "Only SAME or VALID padding supported");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 1, "Only SAME or VALID padding supported");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 1, "Only SAME or VALID padding supported");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != conv_info.pad_left(), "Only SAME or VALID padding supported");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != conv_info.pad_bottom(), "Only SAME or VALID padding supported");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != conv_info.pad_left(), "Only SAME or VALID padding supported");
        return validate_kernel_3x3(input_dims, input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info);
    }
    else if(kernel_size == Size2D(5, 5))
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 2, "Only SAME or VALID padding supported");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 2, "Only SAME or VALID padding supported");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 2, "Only SAME or VALID padding supported");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 2, "Only SAME or VALID padding supported");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != conv_info.pad_left(), "Only SAME or VALID padding supported");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != conv_info.pad_bottom(), "Only SAME or VALID padding supported");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != conv_info.pad_left(), "Only SAME or VALID padding supported");
        return validate_kernel_5x5(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info);
    }
    if(kernel_size == Size2D(3, 1))
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 1, "Only SAME or VALID padding supported");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 1, "Only SAME or VALID padding supported");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_bottom() != 0, "Only SAME or VALID padding supported");
        return validate_kernel_3x1(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info);
    }
    else if(kernel_size == Size2D(1, 3))
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 1, "Only SAME or VALID padding supported");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 1, "Only SAME or VALID padding supported");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_right() != 0, "Only SAME or VALID padding supported");
        return validate_kernel_1x3(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info);
    }
    else if(kernel_size == Size2D(5, 1))
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 2, "Only SAME or VALID padding supported");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 2, "Only SAME or VALID padding supported");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_bottom() != 0, "Only SAME or VALID padding supported");
        return validate_kernel_5x1(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info);
    }
    else if(kernel_size == Size2D(1, 5))
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 2, "Only SAME or VALID padding supported");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 2, "Only SAME or VALID padding supported");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_right() != 0, "Only SAME or VALID padding supported");
        return validate_kernel_1x5(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info);
    }
    else if(kernel_size == Size2D(7, 1))
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_left() != 3, "Only SAME or VALID padding supported");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_right() != 0u && conv_info.pad_right() != 3, "Only SAME or VALID padding supported");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_bottom() != 0, "Only SAME or VALID padding supported");
        return validate_kernel_7x1(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info);
    }
    else if(kernel_size == Size2D(1, 7))
    {
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_top() != 0u && conv_info.pad_top() != 3, "Only SAME or VALID padding supported");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_bottom() != 0u && conv_info.pad_bottom() != 3, "Only SAME or VALID padding supported");
        ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.pad_left() != 0u && conv_info.pad_right() != 0, "Only SAME or VALID padding supported");
        return validate_kernel_1x7(input, &input0, &input1, &batched_mm_output, weights, biases, output, winograd_info, act_info);
    }
    else
    {
        ARM_COMPUTE_RETURN_ERROR_MSG("Kernel shape not supported");
    }
}

void NEWinogradConvolutionLayer::prepare()
{
    if(!_is_prepared)
    {
        // Permute weights
        _weights_hwio.allocator()->allocate();
        _permute_weights.run();
        _weights->mark_as_unused();

        // Transform weights
        _kernel_storage.allocator()->allocate();
        NEScheduler::get().schedule(_transform_weights_kernel.get(), Window::DimX);

        _weights_hwio.allocator()->free();
        _is_prepared = true;
    }
}
} // namespace arm_compute
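
A minimal usage sketch of the function defined above. The tensor shapes, variable names and main() scaffolding are illustrative assumptions; only the NEWinogradConvolutionLayer, Tensor and TensorInfo calls follow the library's API as used in this file (default TensorInfo layout is NCHW, whose shape ordering is (W, H, C, N)).

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/NEON/functions/NEWinogradConvolutionLayer.h"
#include "arm_compute/runtime/Tensor.h"

using namespace arm_compute;

int main()
{
    // Illustrative shapes: a 56x56x64 FP32 feature map, 64 3x3 filters, SAME padding.
    // NCHW shape ordering is (W, H, C, N); weights are (W, H, IFM, OFM).
    Tensor src{}, weights{}, bias{}, dst{};
    src.allocator()->init(TensorInfo(TensorShape(56U, 56U, 64U, 1U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 64U, 64U), 1, DataType::F32));
    bias.allocator()->init(TensorInfo(TensorShape(64U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(56U, 56U, 64U, 1U), 1, DataType::F32));

    const PadStrideInfo conv_info(1, 1, 1, 1); // unit strides, pad 1 == SAME for a 3x3 kernel

    // validate() mirrors configure() and is the cheap way to check support up front.
    const Status status = NEWinogradConvolutionLayer::validate(src.info(), weights.info(), bias.info(), dst.info(), conv_info);
    if(status.error_code() == ErrorCode::OK)
    {
        NEWinogradConvolutionLayer conv{};
        conv.configure(&src, &weights, &bias, &dst, conv_info);

        src.allocator()->allocate();
        weights.allocator()->allocate();
        bias.allocator()->allocate();
        dst.allocator()->allocate();
        // ... fill src, weights and bias with data ...

        conv.run(); // the first run() also triggers prepare(), which permutes and transforms the weights once
    }
    return 0;
}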