24 #ifndef SRC_CORE_KERNELS_DEPTWISECONV2DNATIVE_IMPL_H
25 #define SRC_CORE_KERNELS_DEPTWISECONV2DNATIVE_IMPL_H
32 struct ConvolutionInfo;
71 uint32_t depth_multiplier = 1)
81 (
input.padding().bottom +
input.padding().top) *
input.strides_in_bytes().y()),
104 const int32_t current_h = base_h + h * dilation.
y();
105 const bool is_valid_h = current_h >= 0 && current_h < static_cast<int32_t>(run_info.
input_height);
107 const int32_t current_w = base_w +
w * dilation.
x();
108 const bool is_valid_w = current_w >= 0 && current_w < static_cast<int32_t>(run_info.
input_width);
110 return is_valid_h && is_valid_w;
113 template <
typename T>
123 constexpr
auto element_per_vector =
vector_size /
sizeof(T);
129 const VectorType zero_vector =
wrapper::vdup_n(
static_cast<T
>(0), TagType{});
131 Window execution_window = window;
134 Window win_input = window;
139 Window win_weights = win_input;
142 Window win_output = window;
146 Iterator weights_it(weights, win_weights);
152 biases_it =
Iterator(biases, win_weights);
159 const int32_t input_y =
id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
160 const int32_t input_z =
id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
161 const int64_t base_input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
163 auto const base_weights_ptr = weights_it.
ptr();
164 uint32_t x = run_info.x_start;
166 for (; x < run_info.x_leftover_start; x += run_info.x_step)
168 VectorType acc = zero_vector;
169 auto weights_ptr = base_weights_ptr;
170 int64_t input_offset = base_input_offset;
172 for (uint32_t h = 0; h < run_info.weights_height; ++h)
174 int64_t offs = input_offset + x *
sizeof(T);
175 for (uint32_t
w = 0;
w < run_info.weights_width; ++
w)
178 const auto input_vals =
181 input_it.
ptr() + std::min(
static_cast<size_t>(offs), run_info.input_max_offset)))
183 const auto weights_vals =
184 wrapper::vload(
reinterpret_cast<T *
>(weights_ptr +
w * run_info.weights_stride_y) + x);
187 offs += dilation.
x() * run_info.input_stride_y;
190 weights_ptr += run_info.weights_stride_z;
191 input_offset += dilation.
y() * run_info.input_stride_z;
196 const auto biases_vals =
wrapper::vload(
reinterpret_cast<T *
>(biases_it.ptr()) + x);
203 for (; x < run_info.x_end; ++x)
205 auto acc_scalar = T{0};
206 auto weights_ptr = base_weights_ptr;
207 int64_t input_offset = base_input_offset;
209 for (
size_t h = 0; h < run_info.weights_height; ++h)
211 int64_t offs = input_offset + x *
sizeof(T);
212 for (
size_t w = 0;
w < run_info.weights_width; ++
w)
215 const auto input_vals =
217 ? *
reinterpret_cast<T *
>(input_it.
ptr() +
218 std::min(
static_cast<size_t>(offs), run_info.input_max_offset))
220 const auto weights_vals =
221 *(
reinterpret_cast<T *
>(weights_ptr +
w * run_info.weights_stride_y) + x);
223 acc_scalar += (input_vals * weights_vals);
225 offs += dilation.
x() * run_info.input_stride_y;
228 weights_ptr += run_info.weights_stride_z;
229 input_offset += dilation.
y() * run_info.input_stride_z;
234 const auto biases_vals = *(
reinterpret_cast<T *
>(biases_it.ptr()) + x);
235 acc_scalar += biases_vals;
237 *(
reinterpret_cast<T *
>(output_it.
ptr()) + x) = acc_scalar;
240 input_it, weights_it, biases_it, output_it);
243 template <
typename T>
250 unsigned int depth_multiplier,
254 const auto run_info =
257 Window execution_window = window;
260 Window win_input = execution_window;
265 Window win_weights = window;
271 Window win_output = window;
275 Iterator weights_it(weights, win_weights);
281 biases_it =
Iterator(biases, win_weights);
288 std::vector<T> acc(depth_multiplier,
static_cast<T
>(0));
290 const int input_y =
id.y() * run_info.conv_stride_x - run_info.conv_pad_left;
291 const int input_z =
id.z() * run_info.conv_stride_y - run_info.conv_pad_top;
292 int input_offset = input_y * run_info.input_stride_y + input_z * run_info.input_stride_z;
294 auto weights_ptr = weights_it.
ptr();
295 for (
size_t h = 0; h < run_info.weights_height; ++h)
297 int offs = input_offset;
298 for (
size_t w = 0;
w < run_info.weights_width; ++
w)
301 const auto input_val =
302 is_valid_region ? *(
reinterpret_cast<T *
>(input_it.
ptr() + std::min(
static_cast<size_t>(offs),
303 run_info.input_max_offset)))
306 for (
size_t m = 0; m < depth_multiplier; ++m)
308 const auto weights_val =
309 *(
reinterpret_cast<T *
>(weights_ptr + m *
sizeof(T) +
w * run_info.weights_stride_y));
313 offs += dilation.
x() * run_info.input_stride_y;
316 weights_ptr += run_info.weights_stride_z;
317 input_offset += dilation.
y() * run_info.input_stride_z;
322 for (
size_t m = 0; m < depth_multiplier; ++m)
324 const auto biases_val = *(
reinterpret_cast<T *
>(biases_it.ptr() + m *
sizeof(T)));
325 *(
reinterpret_cast<T *
>(output_it.
ptr() + m *
sizeof(T))) = acc.at(m) + biases_val;
330 for (
size_t m = 0; m < depth_multiplier; ++m)
332 *(
reinterpret_cast<T *
>(output_it.
ptr() + m *
sizeof(T))) = acc.at(m);
336 input_it, weights_it, biases_it, output_it);
339 template <
typename T,
typename TW>
349 unsigned int depth_multiplier =
info.depth_multiplier;
352 if (depth_multiplier == 1)
354 depthwise_loop_multiplier1_fp<T>(
src, weights, biases,
dst,
conv_info, dilation, window, has_biases);
358 depthwise_loop_generic_fp<T>(
src, weights, biases,
dst,
conv_info, dilation, depth_multiplier, window,
363 template <
typename T,
typename TW>
374 #endif // SRC_CORE_KERNELS_DEPTWISECONV2DNATIVE_IMPL_H