48 const unsigned int kernel_max_dim = std::max(kernel_dims.
width, kernel_dims.
height);
53 if(kernel_max_dim == 3U)
55 if(kernel_dims ==
Size2D(3U, 3U))
57 output_tile = is_input_lt4_nchw ?
Size2D(2U, 2U) : Size2D(4
U, 4
U);
59 else if(kernel_dims ==
Size2D(3U, 1U))
61 output_tile = is_input_lt4_nchw ?
Size2D(2U, 1U) : Size2D(4
U, 1
U);
65 output_tile = is_input_lt4_nchw ?
Size2D(1U, 2U) : Size2D(1
U, 4
U);
68 else if(kernel_max_dim == 5U)
70 output_tile =
Size2D(kernel_dims.
width == 1 ? 1U : 4U,
71 kernel_dims.
height == 1 ? 1U : 4U);
73 else if(kernel_max_dim == 7U)
75 output_tile =
Size2D(kernel_dims.
width == 1 ? 1U : 2U,
76 kernel_dims.
height == 1 ? 1U : 2U);
82 bool check_support_fast_math(
const Size2D &output_tile,
const Size2D &kernel_size)
85 using WinogradConfiguration = std::pair<std::pair<int, int>, std::pair<int, int>>;
87 std::vector<WinogradConfiguration> fast_math_winograd =
89 WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5)),
90 WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(7, 7))
93 auto p = std::make_pair(std::pair<int, int>(output_tile.
width, output_tile.
height),
94 std::pair<int, int>(kernel_size.
width, kernel_size.
height));
96 return std::find(fast_math_winograd.begin(), fast_math_winograd.end(), p) != fast_math_winograd.end();
109 bool enable_fast_math)
125 const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, input->
info()->
data_layout());
128 if(!enable_fast_math)
131 ARM_COMPUTE_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size),
"This Winograd configuration requires enable_fast_math=true");
139 _is_prepared =
false;
140 _original_weights = weights;
143 _memory_group.
manage(&_input0);
144 _memory_group.
manage(&_batched_mm_output);
149 _input_transform.
configure(compile_context, input, &_input0, winograd_info);
152 _filter_transform->configure(compile_context, weights, &_input1, winograd_info);
155 _batched_mm.
configure(compile_context, &_input0, &_input1,
nullptr, &_batched_mm_output, 1.0f, 0.0f,
GEMMInfo(
false,
false,
true , 0,
false,
false,
160 _output_transform->configure(compile_context, &_batched_mm_output, biases, output, winograd_info, act_info);
177 const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, input->
data_layout());
183 if(!enable_fast_math)
197 const TensorInfo input0 = input->
clone()->set_tensor_shape(input0_shape);
202 const TensorInfo input1 = weights->
clone()->set_tensor_shape(input1_shape);
208 const TensorInfo batched_mm_output = input0.
clone()->set_tensor_shape(batched_mm_output_shape);
209 ARM_COMPUTE_RETURN_ON_ERROR(
CLGEMM::validate(&input0, &input1,
nullptr, &batched_mm_output, 1.0f, 0.0f,
GEMMInfo(
false,
false,
true , 0,
false,
false,
225 _input_transform.
run();
void prepare() override
Prepare the function for execution.
TensorShape compute_winograd_input_transform_shape(const ITensorInfo &input, const WinogradInfo &winograd_info)
Calculate the winograd input transform shape.
std::unique_ptr< ITensorInfo > clone() const override
Provide a clone of the current object of class T.
void run() override
Run the kernels contained in the function.
CLWinogradConvolutionLayer(std::shared_ptr< IMemoryManager > memory_manager=nullptr)
Default constructor.
static CLScheduler & get()
Access the scheduler singleton.
void run() override
Run the kernels contained in the function.
#define ARM_COMPUTE_RETURN_ON_ERROR(status)
Checks if a status contains an error and returns it.
virtual DataType data_type() const =0
Data type used for each element of the tensor.
bool is_used() const
Flags if the tensor is used or not.
1 channel, 1 F32 per channel
void prepare() override
Prepare the function for execution.
const DataLayout data_layout
static CLKernelLibrary & get()
Access the KernelLibrary singleton.
Store the tensor's metadata.
CLTensorAllocator * allocator()
Return a pointer to the tensor's allocator.
unsigned int pad_top() const
Get the top padding.
Activation Layer Information class.
Copyright (c) 2017-2021 Arm Limited.
size_t height
Height of the image region or rectangle.
1 channel, 1 F16 per channel
void mark_as_unused() const
Marks a tensor as unused.
void manage(IMemoryManageable *obj) override
Sets a object to be managed by the given memory group.
Interface to enqueue OpenCL kernels and get/set the OpenCL CommandQueue and ICLTuner.
void run() override final
Run the kernels contained in the function.
virtual const TensorShape & tensor_shape() const =0
Size for each dimension of the tensor.
#define ARM_COMPUTE_ERROR_ON_MSG(cond, msg)
virtual std::unique_ptr< T > clone() const =0
Provide a clone of the current object of class T.
GEMMLowp output stage info.
virtual ITensorInfo * info() const =0
Interface to be implemented by the child class to return the tensor's metadata.
unsigned int pad_right() const
Get the right padding.
Padding and stride information class.
TensorShape compute_winograd_filter_transform_shape(const ITensorInfo &input, const WinogradInfo &winograd_info)
Calculate the winograd filter transform shape.
static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info=ActivationLayerInfo(), bool enable_fast_math=false)
Static function to check if given info will lead to a valid configuration of CLWinogradConvolutionLayer.
cl::CommandQueue & queue()
Accessor for the associated CL command queue.
void enqueue(ICLKernel &kernel, bool flush=true)
Schedule the execution of the passed kernel if possible.
Num samples, channels, height, width.
#define ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
~CLWinogradConvolutionLayer()
Default destructor.
void allocate() override
Allocate size specified by TensorInfo of OpenCL memory.
Memory group resources scope handling class.
Interface for OpenCL tensor.
size_t width
Width of the image region or rectangle.
void configure(const ICLTensor *a, const ICLTensor *b, const ICLTensor *c, ICLTensor *output, float alpha, float beta, const GEMMInfo &gemm_info=GEMMInfo())
Initialise the kernel's inputs and output.
Class for specifying the size of an image or rectangle.
#define ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(t, c,...)
void free() override
Free allocated OpenCL memory.
void configure(ICLTensor *input, const ICLTensor *weights, const ICLTensor *biases, ICLTensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info=ActivationLayerInfo(), bool enable_fast_math=false)
Set the input and output tensors.
static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info=GEMMInfo())
Static function to check if given info will lead to a valid configuration of CLGEMM.
#define ARM_COMPUTE_RETURN_ERROR_ON_MSG(cond, msg)
If the condition is true, an error is returned.
Store the tensor's metadata.
size_t get_data_layout_dimension_index(const DataLayout data_layout, const DataLayoutDimension data_layout_dimension)
Get the index of the given dimension.
unsigned int pad_bottom() const
Get the bottom padding.
const TensorShape & tensor_shape() const override
Size for each dimension of the tensor.
unsigned int pad_left() const
Get the left padding.
DataLayout
[DataLayout enum definition]
virtual DataLayout data_layout() const =0
Get the data layout of the tensor.