50 Size2D winograd_output_tile(
const Size2D &input_dims,
const Size2D &kernel_dims,
DataLayout data_layout)
52 Size2D output_tile = Size2D{};
54 const unsigned int kernel_max_dim = std::max(kernel_dims.width, kernel_dims.height);
57 const bool is_input_lt4_nchw =
60 if (kernel_max_dim == 3U)
62 if (kernel_dims == Size2D(3U, 3U))
64 output_tile = is_input_lt4_nchw ? Size2D(2U, 2U) : Size2D(4
U, 4
U);
66 else if (kernel_dims == Size2D(3U, 1U))
68 output_tile = is_input_lt4_nchw ? Size2D(2U, 1U) : Size2D(4
U, 1
U);
72 output_tile = is_input_lt4_nchw ? Size2D(1U, 2U) : Size2D(1
U, 4
U);
75 else if (kernel_max_dim == 5U)
77 output_tile = Size2D(kernel_dims.width == 1 ? 1U : 4U, kernel_dims.height == 1 ? 1U : 4U);
79 else if (kernel_max_dim == 7U)
81 output_tile = Size2D(kernel_dims.width == 1 ? 1U : 2U, kernel_dims.height == 1 ? 1U : 2U);
87 bool check_support_fast_math(
const Size2D &output_tile,
const Size2D &kernel_size)
90 using WinogradConfiguration = std::pair<std::pair<int, int>, std::pair<int, int>>;
92 std::vector<WinogradConfiguration> fast_math_winograd = {
93 WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5)),
94 WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(7, 7))};
96 auto p = std::make_pair(std::pair<int, int>(output_tile.width, output_tile.height),
97 std::pair<int, int>(kernel_size.width, kernel_size.height));
99 return std::find(fast_math_winograd.begin(), fast_math_winograd.end(), p) != fast_math_winograd.end();
// NOTE(review): garbled excerpt — stray original line numbers, statements split
// mid-token, and elided lines (head of the function, including its name and the
// `src` parameter, is missing). It appears to be the tail of a Winograd conv2d
// validate routine: it derives the kernel size and output tile, rejects
// unsupported padding, gates fast-math-only configurations, and builds the
// transformed TensorInfos for the batched matrix multiply.
// TODO(review): recover the full body from version control.
103 const ITensorInfo *weights,
104 const ITensorInfo *biases,
105 const ITensorInfo *
dst,
107 const ActivationLayerInfo &
act_info,
108 bool enable_fast_math)
// Kernel spatial size read from the weights tensor shape.
116 const Size2D kernel_size = Size2D(weights->tensor_shape()[
idx_width], weights->tensor_shape()[
idx_height]);
// Output tile chosen from input dims, kernel size and data layout.
117 const Size2D output_tile = winograd_output_tile(input_dims, kernel_size,
src->data_layout());
// Reject horizontal padding larger than half the kernel width
// (fragment of an ARM_COMPUTE_RETURN_ERROR_ON_MSG condition — macro name elided).
120 ((
conv_info.pad_left() > (kernel_size.x() / 2u)) || (
conv_info.pad_right() > (kernel_size.x() / 2u))),
121 "Winograd only supports padding up to half kernel size");
// Same check for vertical padding.
123 ((
conv_info.pad_top() > (kernel_size.y() / 2u)) || (
conv_info.pad_bottom() > (kernel_size.y() / 2u))),
124 "Winograd only supports padding up to half kernel size");
// Fast-math-only configurations are rejected unless the caller opted in
// (error body elided; presumably calls check_support_fast_math — confirm).
127 if (!enable_fast_math)
132 "This Winograd configuration requires enable_fast_math=true");
// Describe the full Winograd transform for the shape-inference calls below.
135 const WinogradInfo winograd_info =
136 WinogradInfo(output_tile, kernel_size, input_dims,
conv_info,
src->data_layout());
// Validate input transform: input0 is src reshaped to the transformed-input
// shape (shape computation on the elided lines).
139 const TensorShape input0_shape =
141 const TensorInfo input0 =
src->clone()->set_tensor_shape(input0_shape);
// Validate filter transform: input1 is weights reshaped likewise.
145 const TensorShape input1_shape =
147 const TensorInfo input1 = weights->clone()->set_tensor_shape(input1_shape);
// Batched matmul output: input0's shape with dimension 0 taken from input1.
151 TensorShape batched_mm_output_shape = input0.tensor_shape();
152 batched_mm_output_shape[0] = input1.tensor_shape()[0];
153 const TensorInfo batched_mm_output = input0.clone()->set_tensor_shape(batched_mm_output_shape);
// GEMMInfo arguments (several flags and trailing arguments elided).
156 GEMMInfo(
false,
false,
true , 0,
false,
false,
// NOTE(review): isolated fragment of a constructor member-initializer list
// (default-initializes _batched_mm_output); surrounding initializers elided.
175 _batched_mm_output(),
// NOTE(review): garbled fragment of the operator's configure() method — the
// signature head and many statements are elided. It wires up the four Winograd
// stages: input transform, filter transform, batched GEMM, output transform.
// TODO(review): recover the full body from version control.
190 bool enable_fast_math)
202 const Size2D output_tile = winograd_output_tile(input_dims, kernel_size,
src->data_layout());
// Same fast-math gate as in validate (error body elided).
205 if (!enable_fast_math)
210 "This Winograd configuration requires enable_fast_math=true");
// prepare() must run again after (re)configuration.
215 _is_prepared =
false;
// Configure input transform: src -> _input0.
218 _input_transform->configure(compile_context,
src, &_input0, winograd_info);
// Configure filter transform: weights -> _input1.
223 _filter_transform->configure(compile_context, weights, &_input1, winograd_info);
// Configure batched GEMM on the transformed tensors (alpha=1, beta=0, no bias).
226 _batched_mm.
configure(compile_context, &_input0, &_input1,
nullptr, &_batched_mm_output, 1.0f, 0.0f,
227 GEMMInfo(
false,
false,
true , 0,
false,
false,
// Configure output transform: GEMM result (+ biases, activation) -> dst.
232 _output_transform->configure(compile_context, &_batched_mm_output, biases,
dst, winograd_info,
act_info);
// Fragment of aux-memory lifetime selection: checks whether any workspace
// entry is already Persistent with non-zero size (ternary head elided).
236 std::any_of(std::begin(_aux_mem),
std::end(_aux_mem),
237 [](
const auto &r) {
return (r.lifetime == MemoryLifetime::Persistent) && (r.size > 0); })
239 : MemoryLifetime::Persistent;
// NOTE(review): isolated tail of another function signature (presumably the
// public static validate() wrapper — confirm); the rest is elided.
251 bool enable_fast_math)
// NOTE(review): fragment of run() — executes the batched GEMM stage with its
// tensor pack; the surrounding transform-stage runs are elided.
285 _batched_mm.
run(pack_mm);
// NOTE(review): fragment of prepare() — releases the original weights once the
// transformed copy exists, then lets the GEMM do its one-off preparation.
311 weights->mark_as_unused();
316 _batched_mm.
prepare(mm_prepare_pack);