24.02.1
|
Basic function to execute FFT-based convolution on OpenCL.
More...
#include <CLFFTConvolutionLayer.h>
|
| CLFFTConvolutionLayer (std::shared_ptr< IMemoryManager > memory_manager=nullptr) |
| Default constructor. More...
|
|
| CLFFTConvolutionLayer (const CLFFTConvolutionLayer &)=delete |
| Prevent instances of this class from being copied (As this class contains pointers) More...
|
|
| CLFFTConvolutionLayer (CLFFTConvolutionLayer &&)=default |
| Default move constructor. More...
|
|
CLFFTConvolutionLayer & | operator= (const CLFFTConvolutionLayer &)=delete |
| Prevent instances of this class from being copied (As this class contains pointers) More...
|
|
CLFFTConvolutionLayer & | operator= (CLFFTConvolutionLayer &&)=default |
| Default move assignment operator. More...
|
|
void | configure (ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info=ActivationLayerInfo(), bool enable_fast_math=false) |
| Set the input and output tensors. More...
|
|
void | configure (const CLCompileContext &compile_context, ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info=ActivationLayerInfo(), bool enable_fast_math=false) |
| Set the input and output tensors. More...
|
|
void | run () override |
| Run the kernels contained in the function. More...
|
|
void | prepare () override |
| Prepare the function for executing. More...
|
|
virtual | ~IFunction ()=default |
| Destructor. More...
|
|
Basic function to execute FFT-based convolution on OpenCL.
This function calls the following OpenCL functions/kernels:
- CLPermute Permute input if NHWC (only NCHW is supported).
- CLPadLayer Pad input.
- CLFFT2D Forward transform to the frequency domain.
- CLComplexPixelWiseMultiplication Complex element-wise product of input and the weights.
- CLReductionOperation Reduction across channels.
- CLFFT2D Inverse transform back to the time domain.
- CLStridedSlice Extract valid output.
- CLArithmeticAddition Add bias.
- CLActivationLayer Perform activation.
- CLPermute Permute output if NHWC (only NCHW is supported).
Definition at line 58 of file CLFFTConvolutionLayer.h.
◆ CLFFTConvolutionLayer() [1/3]
Default constructor.
Definition at line 65 of file CLFFTConvolutionLayer.cpp.
66 : _memory_group(memory_manager),
68 _permute_input_func(),
69 _permute_output_func(),
70 _permute_weights_func(),
74 _transform_input_func(memory_manager),
75 _transform_weights_func(),
76 _itransform_output_func(memory_manager),
79 _extract_output_func(),
81 _activation_layer_func(),
91 _transformed_weights(),
92 _input_weights_product(),
95 _itransformed_output(),
98 _original_weights(
nullptr),
99 _original_bias(
nullptr),
100 _is_activationlayer_enabled(
false),
101 _needs_permute(
false),
◆ CLFFTConvolutionLayer() [2/3]
Prevent instances of this class from being copied (As this class contains pointers)
◆ CLFFTConvolutionLayer() [3/3]
Default move constructor.
◆ configure() [1/2]
Set the input and output tensors.
- Note
- : This function only works with square kernel sizes and unit strides, for both the NCHW and NHWC data layouts
- Parameters
-
[in] | compile_context | The compile context to be used. |
[in] | input | Source tensor. 3 lower dimensions represent a single input [width, height, IFM], while every optional dimension from 4 and above represent a batch of inputs. Data types supported: F16/F32. |
[in] | weights | Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as input. |
[in] | biases | Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as input |
[out] | output | Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. Data types supported: Same as input . |
[in] | conv_info | Contains padding and stride information described in PadStrideInfo. |
[in] | act_info | (Optional) Activation layer information in case of a fused activation. |
[in] | enable_fast_math | (Optional) Enable fast math computation. If this flag is set, the function may dispatch the fastest implementation available, which can introduce a drop in accuracy. Default is false |
Definition at line 119 of file CLFFTConvolutionLayer.cpp.
130 biases !=
nullptr ? biases->info() :
nullptr,
134 _original_weights = weights;
135 _original_bias = biases;
138 _has_bias = biases !=
nullptr;
146 const Size2D input_dims =
148 const Size2D kernel_size =
149 Size2D(weights->info()->tensor_shape()[
idx_width], weights->info()->tensor_shape()[
idx_height]);
150 const Size2D pad_valid = Size2D(pad_decomposable(input_dims.x() + kernel_size.x() - 1),
151 pad_decomposable(input_dims.y() + kernel_size.y() - 1));
153 ICLTensor *input_to_use =
input;
154 const ICLTensor *weights_to_use = weights;
155 ICLTensor *output_to_use = _has_bias ? &_bias_output : output;
158 if (biases !=
nullptr)
168 _memory_group.
manage(&_permuted_input);
177 input_to_use = &_permuted_input;
178 weights_to_use = &_permuted_weights;
182 _flipped_weights.
allocator()->
init(weights_to_use->info()->clone()->set_is_resizable(
true).reset_padding());
184 _flip_weights_func.
configure(compile_context, weights_to_use, &_flipped_weights, &_flip_axis,
188 const PaddingList padding_w = {{0, input_dims.x() + pad_valid.x() - 1}, {0, input_dims.y() + pad_valid.y() - 1}};
189 _pad_weights_func.
configure(compile_context, &_flipped_weights, &_padded_weights, padding_w);
192 _transform_weights_func = std::make_unique<CLFFT2D>();
193 _transform_weights_func->configure(compile_context, &_padded_weights, &_transformed_weights, FFT2DInfo());
196 const PaddingList padding_in = {{0, kernel_size.x() + pad_valid.x() - 1}, {0, kernel_size.y() + pad_valid.y() - 1}};
197 _memory_group.
manage(&_padded_input);
198 _pad_input_func.
configure(compile_context, input_to_use, &_padded_input, padding_in);
205 _memory_group.
manage(&_transformed_input);
206 _transform_input_func.
configure(compile_context, &_padded_input, &_transformed_input, FFT2DInfo());
210 _memory_group.
manage(&_output_product);
211 _prod_func.
configure(compile_context, &_transformed_input, &_transformed_weights, &_output_product);
215 _memory_group.
manage(&_output_reduced);
220 _memory_group.
manage(&_itransformed_output);
221 FFT2DInfo itranform_info;
224 _output_reduced.
info()->
clone()->set_is_resizable(
true).set_num_channels(1).reset_padding());
225 _itransform_output_func.
configure(compile_context, &_output_reduced, &_itransformed_output, itranform_info);
231 _reshaped_output.
allocator()->
init(_itransformed_output.
info()->
clone()->set_tensor_shape(reshaped_shape));
234 const int start_left = kernel_size.x() -
conv_info.pad_left() - 1;
235 const int start_top = kernel_size.y() -
conv_info.pad_top() - 1;
236 const int end_right =
238 const int end_botton =
242 _memory_group.
manage(&_bias_output);
244 else if (_needs_permute)
246 output_to_use = &_permuted_output;
247 _memory_group.
manage(&_permuted_output);
249 _extract_output_func.
configure(compile_context, &_reshaped_output, output_to_use,
250 Coordinates(start_left, start_top), Coordinates(end_right, end_botton));
254 if (biases !=
nullptr)
256 output_to_use = output;
259 output_to_use = &_permuted_output;
260 _memory_group.
manage(&_permuted_output);
279 _is_activationlayer_enabled =
act_info.enabled();
280 if (_is_activationlayer_enabled)
287 _flip_axis.
map(
true);
288 auto axis_data =
reinterpret_cast<uint32_t *
>(_flip_axis.
buffer());
References arm_compute::test::validation::act_info, CLTensorAllocator::allocate(), CLTensor::allocator(), ARM_COMPUTE_ERROR_THROW_ON, ARM_COMPUTE_LOG_PARAMS, ARM_COMPUTE_UNUSED, arm_compute::auto_init_if_empty(), ICLTensor::buffer(), ICloneable< T >::clone(), TensorInfo::clone(), CLReverse::configure(), CLPermute::configure(), CLSlice::configure(), CLFFT2D::configure(), CLPadLayer::configure(), CLActivationLayer::configure(), CLReductionOperation::configure(), CLArithmeticAddition::configure(), CLComplexPixelWiseMultiplication::configure(), arm_compute::test::validation::conv_info, FFT2DInfo::direction, arm_compute::get_data_layout_dimension_index(), arm_compute::HEIGHT, arm_compute::test::validation::idx_height, arm_compute::test::validation::idx_width, ITensor::info(), CLTensor::info(), ITensorAllocator::init(), arm_compute::test::validation::input, arm_compute::Inverse, MemoryGroup::manage(), CLTensor::map(), arm_compute::NCHW, arm_compute::NHWC, TensorShape::remove_dimension(), TensorInfo::set_data_layout(), arm_compute::SUM, ITensorInfo::tensor_shape(), TensorInfo::tensor_shape(), arm_compute::utils::cast::U, arm_compute::U32, CLTensor::unmap(), CLFFTConvolutionLayer::validate(), arm_compute::WIDTH, arm_compute::WRAP, Size2D::x(), Dimensions< T >::x(), Size2D::y(), and Dimensions< T >::y().
◆ configure() [2/2]
Set the input and output tensors.
Valid data layouts:
Valid data type configurations:
- Note
- : This function only works with square kernel sizes and unit strides, for both the NCHW and NHWC data layouts
- Parameters
-
[in] | input | Source tensor. 3 lower dimensions represent a single input [width, height, IFM], while every optional dimension from 4 and above represent a batch of inputs. Data types supported: F16/F32. |
[in] | weights | Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as input. |
[in] | biases | Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as input |
[out] | output | Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. Data types supported: Same as input . |
[in] | conv_info | Contains padding and stride information described in PadStrideInfo. |
[in] | act_info | (Optional) Activation layer information in case of a fused activation. |
[in] | enable_fast_math | (Optional) Enable fast math computation. If this flag is set, the function may dispatch the fastest implementation available, which can introduce a drop in accuracy. Default is false |
Definition at line 107 of file CLFFTConvolutionLayer.cpp.
References arm_compute::test::validation::act_info, arm_compute::test::validation::conv_info, CLKernelLibrary::get(), and arm_compute::test::validation::input.
◆ operator=() [1/2]
Default move assignment operator.
◆ operator=() [2/2]
Prevent instances of this class from being copied (As this class contains pointers)
◆ prepare()
Prepare the function for executing.
Any one-off pre-processing steps required by the function are handled here
- Note
- Prepare stage might not need all the function's buffers' backing memory to be available in order to execute
Reimplemented from IFunction.
Definition at line 385 of file CLFFTConvolutionLayer.cpp.
390 if (_original_bias !=
nullptr)
393 _permute_bias_func.
run();
397 const ICLTensor *cur_weights = _original_weights;
404 _permute_weights_func.
run();
405 cur_weights->mark_as_unused();
406 cur_weights = &_permuted_weights;
411 _flip_weights_func.
run();
412 cur_weights->mark_as_unused();
416 _pad_weights_func.
run();
423 _transform_weights_func->run();
427 _transform_weights_func.reset();
References CLTensorAllocator::allocate(), CLTensor::allocator(), ARM_COMPUTE_ERROR_ON, CLTensorAllocator::free(), CLScheduler::get(), ITensor::is_used(), ITensor::mark_as_unused(), CLScheduler::queue(), ICLSimpleFunction::run(), CLPermute::run(), and CLPadLayer::run().
Referenced by CLFFTConvolutionLayer::run().
◆ run()
Run the kernels contained in the function.
For CPU kernels:
- Multi-threading is used for the kernels which are parallelisable.
- By default std::thread::hardware_concurrency() threads are used.
- Note
- CPPScheduler::set_num_threads() can be used to manually set the number of threads
For OpenCL kernels:
- All the kernels are enqueued on the queue associated with CLScheduler.
- The queue is then flushed.
- Note
- The function will not block until the kernels are executed. It is the user's responsibility to wait.
-
Will call prepare() on first run if it hasn't already been done
Implements IFunction.
Definition at line 346 of file CLFFTConvolutionLayer.cpp.
350 MemoryGroupResourceScope scope_mg(_memory_group);
355 _permute_input_func.
run();
357 _pad_input_func.
run();
358 _transform_input_func.
run();
365 _itransform_output_func.
run();
367 _extract_output_func.
run();
371 _bias_add_func.
run();
375 _permute_output_func.
run();
379 if (_is_activationlayer_enabled)
381 _activation_layer_func.
run();
References CLTensor::allocator(), CLTensor::cl_buffer(), CLTensorAllocator::import_memory(), CLFFTConvolutionLayer::prepare(), CLFFT2D::run(), CLPermute::run(), CLActivationLayer::run(), CLSlice::run(), CLReductionOperation::run(), CLPadLayer::run(), CLArithmeticAddition::run(), and CLComplexPixelWiseMultiplication::run().
◆ validate()
Static function to check if given info will lead to a valid configuration of CLFFTConvolutionLayer.
- Note
- : This function only works with square kernel sizes and unit strides, for both the NCHW and NHWC data layouts
- Parameters
-
[in] | input | Source tensor. 3 lower dimensions represent a single input [width, height, IFM], while every optional dimension from 4 and above represent a batch of inputs. Data types supported: F16/F32. |
[in] | weights | Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as input. |
[in] | biases | Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as input |
[out] | output | Destination tensor. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs. Data types supported: Same as input . |
[in] | conv_info | Contains padding and stride information described in PadStrideInfo. |
[in] | act_info | (Optional) Activation layer information in case of a fused activation. |
[in] | enable_fast_math | (Optional) Enable fast math computation. If this flag is set, the function may dispatch the fastest implementation available, which can introduce a drop in accuracy. Default is false |
- Returns
- a status
Definition at line 294 of file CLFFTConvolutionLayer.cpp.
311 const Size2D kernel_size = Size2D(weights->tensor_shape()[
idx_width], weights->tensor_shape()[
idx_height]);
318 conv_info.pad_right() != (kernel_size.x() / 2));
320 conv_info.pad_bottom() != (kernel_size.y() / 2));
323 if (biases !=
nullptr)
330 if ((output !=
nullptr) && (output->total_size() != 0))
References arm_compute::test::validation::act_info, ARM_COMPUTE_RETURN_ERROR_ON, ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN, ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES, ARM_COMPUTE_RETURN_ON_ERROR, arm_compute::test::validation::conv_info, arm_compute::F16, arm_compute::F32, arm_compute::get_data_layout_dimension_index(), arm_compute::HEIGHT, arm_compute::test::validation::idx_height, arm_compute::test::validation::idx_width, arm_compute::test::validation::input, ITensorInfo::tensor_shape(), ITensorInfo::total_size(), CLActivationLayer::validate(), arm_compute::WIDTH, Size2D::x(), Dimensions< T >::x(), and Size2D::y().
Referenced by CLFFTConvolutionLayer::configure(), ClConv2d::get_convolution_method(), and CLConvolutionLayer::validate().
The documentation for this class was generated from the following files:
@ NCHW
Num samples, channels, height, width.
void run() override
Run the kernels contained in the function.
std::unique_ptr< ITensorInfo > clone() const override
void unmap()
Enqueue an unmap operation of the allocated and mapped buffer.
void run() override
Run the kernels contained in the function.
void manage(IMemoryManageable *obj) override
Sets a object to be managed by the given memory group.
std::vector< PaddingInfo > PaddingList
List of padding information.
@ NHWC
Num samples, height, width, channels.
void init(const TensorInfo &input, size_t alignment=0)
Initialize a tensor based on the passed TensorInfo.
void run() override
Run the kernels contained in the function.
void configure(ICLTensor *input, ICLTensor *output, ActivationLayerInfo act_info)
Set the input and output tensor.
void map(bool blocking=true)
Enqueue a map operation of the allocated buffer.
static CLKernelLibrary & get()
Access the KernelLibrary singleton.
#define ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(...)
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
#define ARM_COMPUTE_RETURN_ON_ERROR(status)
Checks if a status contains an error and returns it.
uint8_t * buffer() const override
Interface to be implemented by the child class to return a pointer to CPU memory.
Strides PermutationVector
Permutation vector.
void configure(const ICLTensor *input, ICLTensor *output, const Coordinates &starts, const Coordinates &ends)
Configure kernel.
#define ARM_COMPUTE_ERROR_ON(cond)
If the condition is true then an error message is printed and an exception thrown.
void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, const ActivationLayerInfo &act_info=ActivationLayerInfo())
Initialise the kernel's inputs, output.
Status import_memory(cl::Buffer buffer)
Import an existing memory as a tensor's backing memory.
#define ARM_COMPUTE_ERROR_THROW_ON(status)
@ U32
unsigned 32-bit number
void run() override
Run the kernels contained in the function.
void run() override
Run the kernels contained in the function.
#define ARM_COMPUTE_RETURN_ERROR_ON(cond)
If the condition is true, an error is returned.
void run() override
Run the kernels contained in the function.
bool auto_init_if_empty(ITensorInfo &info, const TensorShape &shape, int num_channels, DataType data_type, QuantizationInfo quantization_info=QuantizationInfo())
Auto initialize the tensor info (shape, number of channels and data type) if the current assignment i...
void mark_as_unused() const
Marks a tensor as unused.
ITensorInfo & set_data_layout(const DataLayout &data_layout) override
Set the data layout of the tensor.
void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info=ActivationLayerInfo(), bool enable_fast_math=false)
Set the input and output tensors.
void configure(ICLTensor *input1, ICLTensor *input2, ICLTensor *output, ConvertPolicy policy, const ActivationLayerInfo &act_info=ActivationLayerInfo())
Initialise the kernel's inputs, output and conversion policy.
const cl::Buffer & cl_buffer() const override
Interface to be implemented by the child class to return a reference to the OpenCL buffer containing ...
void run() override
Run the kernels contained in the function.
T x() const
Alias to access the size of the first dimension.
static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ActivationLayerInfo &act_info)
Static function to check if given info will lead to a valid configuration of CLActivationLayer.
static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info=ActivationLayerInfo(), bool enable_fast_math=false)
Static function to check if given info will lead to a valid configuration of CLFFTConvolutionLayer.
void allocate() override
Allocate size specified by TensorInfo of OpenCL memory.
#define ARM_COMPUTE_UNUSED(...)
To avoid unused variables warnings.
CLTensorAllocator * allocator()
Return a pointer to the tensor's allocator.
void configure(const ICLTensor *input, ICLTensor *output, const PermutationVector &perm)
Set the input and output tensors.
static CLScheduler & get()
Access the scheduler singleton.
size_t get_data_layout_dimension_index(const DataLayout &data_layout, const DataLayoutDimension &data_layout_dimension)
Get the index of the given dimension.
void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *axis, bool use_inverted_axis)
Initialize the function.
void free() override
Free allocated OpenCL memory.
void prepare() override
Prepare the function for executing.
void remove_dimension(size_t n, bool apply_dim_correction=true)
Accessor to remove the dimension n from the tensor shape.
void configure(ICLTensor *input, ICLTensor *output, unsigned int axis, ReductionOperation op, bool keep_dims=true)
Set the input and output tensors.
@ F16
16-bit floating-point number
void configure(ICLTensor *input, ICLTensor *output, const PaddingList &padding, PixelValue constant_value=PixelValue(), PaddingMode mode=PaddingMode::CONSTANT)
Initialize the function.
void run() override
Run the kernels contained in the function.
TensorInfo * info() const override
Interface to be implemented by the child class to return the tensor's metadata.
@ F32
32-bit floating-point number
cl::CommandQueue & queue()
Accessor for the associated CL command queue.
T y() const
Alias to access the size of the second dimension.
#define ARM_COMPUTE_LOG_PARAMS(...)
void configure(const ICLTensor *input, ICLTensor *output, const FFT2DInfo &config)
Initialise the function's source, destinations and border mode.
const TensorShape & tensor_shape() const override
Size for each dimension of the tensor.
void run() override final
Run the kernels contained in the function.