29 #ifdef CYCLE_PROFILING 30 #include "profiler.hpp" 38 template <
class strategy>
40 typename strategy::weight_type,
41 typename strategy::return_type>
43 using TInput =
typename strategy::input_type;
44 using TWeight =
typename strategy::weight_type;
45 using TOutput =
typename strategy::return_type;
46 using TAccum =
typename strategy::bias_type;
48 size_t sizeof_input_buffer(
unsigned int n_input_channels)
const 50 return sizeof(TInput) * n_input_channels;
53 size_t sizeof_output_buffer(
unsigned int n_output_channels)
const 55 return sizeof(TOutput) * n_output_channels;
70 const unsigned int vl = arm_gemm::utils::get_vector_length<TInput>(strategy::vl_type);
72 return (1 + this->m_args.kernel_rows * this->m_args.kernel_cols) * rounded_channels *
sizeof(TWeight);
75 void pack_parameters(
void *_buffer,
const void *_biases,
const void *_weights,
size_t ld_weight_col,
size_t ld_weight_row)
override 80 uint8_t *buffer =
static_cast<uint8_t *
>(_buffer);
81 const TAccum *biases =
static_cast<const TAccum *
>(_biases);
82 const TWeight *
const weights =
static_cast<const TWeight *
>(_weights);
84 const unsigned int vl = arm_gemm::utils::get_vector_length<TAccum>(strategy::vl_type);
85 ld_weight_col = (ld_weight_col == 0) ? this->m_args.input_channels : ld_weight_col;
86 ld_weight_row = (ld_weight_row == 0) ? this->m_args.kernel_cols * ld_weight_col : ld_weight_row;
88 for (
unsigned int n = 0; n < this->m_args.input_channels; n += vl)
90 const unsigned int todo = std::min(vl, this->m_args.input_channels - n);
93 for (
unsigned int i = 0; i < todo; i++)
95 reinterpret_cast<TAccum *
>(buffer)[i] = (biases ==
nullptr) ? 0 : biases[n + i];
97 buffer += vl *
sizeof(TAccum);
100 auto weights_row = weights + n;
101 for (
unsigned int i = 0; i < this->m_args.kernel_rows; i++)
103 auto weights_col = weights_row;
105 for (
unsigned int j = 0; j < this->m_args.kernel_cols; j++)
107 for (
unsigned int m = 0; m < todo; m++)
109 reinterpret_cast<TWeight *
>(buffer)[m] = weights_col[m];
111 buffer += vl *
sizeof(TWeight);
113 weights_col += ld_weight_col;
116 weights_row += ld_weight_row;
121 size_t get_working_size(
const unsigned int n_threads,
const unsigned int n_channels)
const override 123 const unsigned int n_output_channels = n_channels * this->m_args.channel_multiplier;
124 return n_threads * (sizeof_output_buffer(n_output_channels) + sizeof_input_buffer(n_channels));
127 using DepthwiseCommon<typename strategy::input_type, typename strategy::weight_type, typename strategy::return_type>::execute;
132 const unsigned int input_channels,
133 const PaddingValues &padding,
134 const void *
const _input,
135 const size_t ld_input_col,
136 const size_t ld_input_row,
137 const size_t ld_input_batch,
139 const unsigned int output_height,
140 const unsigned int output_width,
142 const size_t ld_output_col,
143 const size_t ld_output_row,
144 const size_t ld_output_batch,
145 void *
const _working_space,
146 const unsigned int thread_id,
147 const unsigned int n_threads
150 strategy strat(this->m_args.cpu_info);
151 #ifdef CYCLE_PROFILING 152 arm_gemm::profiler prof;
156 TAccum activation_min, activation_max;
157 std::tie(activation_min, activation_max) = get_default_activation_values<TAccum>();
159 switch (this->m_args.activation.type)
162 activation_max =
static_cast<TAccum
>(this->m_args.activation.param1);
165 activation_min =
static_cast<TAccum
>(0);
173 const int start_out_height = std::min(thread_id * n_rows_per_thread, output_height);
174 const int end_out_height = std::min(start_out_height + n_rows_per_thread, output_height);
177 const TInput *
const inptr =
static_cast<const TInput *
>(_input);
178 TOutput *
const outptr =
static_cast<TOutput *
>(_output);
181 const TInput * _inptr_array[strategy::input_rows * strategy::input_cols];
182 const TInput **
const inptr_array = _inptr_array;
185 TOutput * _outptr_array[strategy::output_rows * strategy::output_cols];
186 TOutput **
const outptr_array = _outptr_array;
189 uint8_t *
const working_space =
static_cast<uint8_t *
>(_working_space) +
get_working_size(thread_id, input_channels);
190 TOutput *
const output_buffer =
reinterpret_cast<TOutput *
>(working_space);
191 TInput *
const input_buffer =
reinterpret_cast<TInput *
>(working_space + sizeof_output_buffer(input_channels * this->m_args.channel_multiplier));
194 for (
unsigned int c = 0; c < input_channels; c++)
196 input_buffer[c] =
static_cast<TInput
>(0);
201 for (
unsigned int batch = 0; batch <
batches; batch++)
204 const auto inptr_batch = inptr + batch * ld_input_batch;
205 const auto outptr_batch = outptr + batch * ld_output_batch;
207 for (
int start_out_i = start_out_height;
208 start_out_i < end_out_height;
209 start_out_i +=
static_cast<int>(strategy::output_rows))
211 const int end_out_i = start_out_i + strategy::output_rows;
212 const int start_in_i = start_out_i * strategy::stride_rows - padding.top;
213 const int end_in_i = start_in_i + strategy::input_rows;
216 const auto pad_top =
static_cast<unsigned int>(-std::min(start_in_i, 0));
217 const auto pad_bottom =
static_cast<unsigned int>(-std::min(static_cast<int>(input_height) - end_in_i, 0));
218 const unsigned int valid_output_rows = std::min(
219 end_out_i - start_out_i,
220 static_cast<int>(output_height) - start_out_i
224 for (
auto index = 0u; index < strategy::input_rows * strategy::input_cols; index++)
226 inptr_array[index] = input_buffer;
229 for (
int start_out_j = 0; start_out_j < static_cast<int>(output_width);)
231 const int start_in_j = start_out_j * strategy::stride_cols - this->m_args.padding.left;
232 const int pad_left = -std::min(0, start_in_j);
235 int n_direct_tiles = 0;
236 if (!pad_top && !pad_bottom && !pad_left)
239 n_direct_tiles = (output_width - start_out_j) / strategy::output_cols;
243 int end_in_j = start_in_j + n_direct_tiles * strategy::input_cols;
244 int pad_right = std::max(0, end_in_j - static_cast<int>(input_width));
246 while (pad_right && n_direct_tiles)
249 end_in_j -= strategy::input_cols;
250 pad_right = std::max(0, end_in_j - static_cast<int>(input_width));
257 auto inptr = inptr_batch + start_in_i*ld_input_row + start_in_j*ld_input_col;
258 auto outptr = outptr_batch + start_out_i*ld_output_row + start_out_j*ld_output_col;
259 start_out_j += n_direct_tiles*strategy::output_cols;
261 #ifdef CYCLE_PROFILING 262 auto p = prof.ScopedProfiler(PROFILE_KERNEL, 0);
264 strat.direct_kernel(1, n_direct_tiles,
265 inptr, ld_input_row, ld_input_col,
266 outptr, ld_output_row, ld_output_col,
267 parameters, this->m_args.input_channels,
268 activation_min, activation_max);
272 const int end_out_j = start_out_j + strategy::output_cols;
273 const int end_in_j = start_in_j + strategy::input_cols;
275 const auto pad_right =
static_cast<unsigned int>(-std::min(static_cast<int>(input_width) - end_in_j, 0));
276 const unsigned int valid_output_cols = std::min(
277 end_out_j - start_out_j,
278 static_cast<int>(output_width) - start_out_j
283 for (
auto i = pad_top; i < strategy::input_rows - pad_bottom; i++)
287 unsigned int j = pad_left;
288 const TInput *colptr = inptr_batch + (start_in_i + i) * ld_input_row + (start_in_j + j) * ld_input_col;
289 const TInput **ptrs = inptr_array + i * strategy::input_cols + j;
290 for (; j < strategy::input_cols - pad_right; j++)
293 colptr += ld_input_col;
295 for (; j < strategy::input_cols; j++)
297 *(ptrs++) = input_buffer;
302 TOutput **outptr_pos = outptr_array;
303 for (
auto i = 0u; i < valid_output_rows; i++)
306 TOutput *colptr = outptr_batch + (start_out_i + i) * ld_output_row + start_out_j * ld_output_col;
307 for (; j < valid_output_cols; j++)
309 *(outptr_pos++) = colptr;
310 colptr += ld_output_col;
312 for (; j < strategy::output_cols; j++)
314 *(outptr_pos++) = output_buffer;
317 for (
auto i = valid_output_rows; i < strategy::output_rows; i++)
319 for (
auto j = 0u; j < strategy::output_cols; j++)
321 *(outptr_pos++) = output_buffer;
325 start_out_j += strategy::output_cols;
327 #ifdef CYCLE_PROFILING 329 auto p = prof.ScopedProfiler(PROFILE_KERNEL, (
unsigned long)(0));
331 strat.indirect_kernel(inptr_array, outptr_array, parameters,
332 this->m_args.input_channels, activation_min, activation_max);
// --- Extraction residue: member index emitted by the documentation tooling.
// --- Preserved as comments for reference; these are not compilable
// --- declarations in this position.
// T roundup(const T a, const T b)
// DepthwiseDepthfirst & operator=(DepthwiseDepthfirst &)=delete
// T iceildiv(const T a, const T b)
// size_t get_working_size(const unsigned int n_threads, const unsigned int n_channels) const override
// std::unique_ptr< ParametersLibrary > parameters
// void pack_parameters(void *_buffer, const void *_biases, const void *_weights, size_t ld_weight_col, size_t ld_weight_row) override
// const size_t input_height
// DepthwiseDepthfirst(const DepthwiseArgs &args)
// template UniqueDepthwiseCommon< float > depthwise(const DepthwiseArgs &, const Nothing &)
// size_t get_storage_size(void) const override
// void execute(const unsigned int batches, const unsigned int input_height, const unsigned int input_width, const unsigned int input_channels, const PaddingValues &padding, const void *const _input, const size_t ld_input_col, const size_t ld_input_row, const size_t ld_input_batch, const void *const parameters, const unsigned int output_height, const unsigned int output_width, void *const _output, const size_t ld_output_col, const size_t ld_output_row, const size_t ld_output_batch, void *const _working_space, const unsigned int thread_id, const unsigned int n_threads) const override