// Class template header, parameterised on a single `strategy` type.
// NOTE(review): the line carrying the class name is not visible in this
// excerpt; only the template header and the base-class clause are shown.
32 template <
class strategy>
// Public base: DepthwiseCommon instantiated with the strategy's input, weight
// and return element types.
34 public DepthwiseCommon<typename strategy::input_type,
35 typename strategy::weight_type,
36 typename strategy::return_type>
// Shorthand for the base class, used by the constructor initialiser list and
// the `using Parent::execute` declaration further down.
38 using Parent = DepthwiseCommon<
typename strategy::input_type,
39 typename strategy::weight_type,
40 typename strategy::return_type>;
// Element-type aliases taken directly from the strategy.
41 using TInput =
typename strategy::input_type;
42 using TWeight =
typename strategy::weight_type;
43 using TOutput =
typename strategy::return_type;
// Returns the size in bytes of an output buffer for `n_channels` channels:
// sizeof(strategy::return_type) multiplied by `rounded_channels`.
47 size_t sizeof_output_buffer(
unsigned int n_channels)
// `vl` is obtained from get_vector_length for the strategy's return type and
// vector-length type (presumably the number of return_type elements per
// vector - confirm against arm_gemm::utils).
const 49 const unsigned int vl = arm_gemm::utils::get_vector_length<typename strategy::return_type>(strategy::vl_type);
// NOTE(review): the definition of `rounded_channels` is not visible in this
// excerpt; presumably n_channels rounded up to a multiple of `vl` - confirm.
51 return sizeof(
typename strategy::return_type) * rounded_channels;
// Constructor initialiser list: forwards `args` to the Parent base class and
// stores `qp` in the m_qp member (later read for a_offset / b_offset and
// passed to the kernel). The constructor's signature line is not visible in
// this excerpt.
56 : Parent(args), m_qp(qp)
// Body of a size computation for the packed parameter buffer.
// NOTE(review): the signature line is not visible in this excerpt (presumably
// get_storage_size - confirm), and the tail of the returned sum is cut off.
//
// iter_length: value of get_vector_length<int32_t> for the strategy's vector
// type (presumably the number of int32 lanes per vector - confirm).
68 const unsigned int iter_length =
69 arm_gemm::utils::get_vector_length<int32_t>(strategy::vl_type);
// Total number of iterations: for every input channel, the channel multiplier
// divided by iter_length via iceildiv (presumably a ceiling division).
70 const unsigned int n_iters =
71 this->m_args.input_channels *
arm_gemm::iceildiv(this->m_args.channel_multiplier, iter_length);
// Kernel columns are handled in groups of four; this is the number of such
// groups per kernel row.
74 const unsigned int n_dots_per_kernel_row =
arm_gemm::iceildiv(strategy::kernel_cols, 4u);
// Bytes = n_iters * iter_length * (per-lane byte count). The first visible
// per-lane term is the weights, padded to 4 * n_dots_per_kernel_row columns
// for each of kernel_rows rows; the remaining terms are not visible here.
76 return n_iters * iter_length * (
78 4 * n_dots_per_kernel_row * strategy::kernel_rows *
sizeof(TWeight) +
// Packs biases and weights into `_buffer`, one block per
// (input channel, iteration) pair.
//
// Parameters:
//   _buffer        destination, addressed as bytes.
//   _biases        int32 biases, consumed sequentially; may be nullptr, in
//                  which case a bias of 0 is used.
//   _weights       source weights, read as TWeight.
//   ld_weight_col  stride between kernel columns in `_weights`; 0 selects
//                  input_channels * channel_multiplier.
//   ld_weight_row  stride between kernel rows in `_weights`; 0 selects
//                  kernel_cols * ld_weight_col.
//
// Visible layout of one block (byte offsets from buffer_base):
//   [0]                                    iter_length int32 bias slots
//   [sizeof(int32_t) * iter_length]        weights, kernel_rows *
//                                          n_dots_per_kernel_row * 4 *
//                                          iter_length bytes
//   [after the weights]                    iter_length int32 requant multipliers
//   [iter_length int32 later]              requant shifts
// NOTE(review): several original lines (braces, the declaration of `kj`, the
// accumulation into `elements_sum`, the store of the final bias and any
// requant writes) are not visible in this excerpt.
85 void pack_parameters(
void *_buffer,
const void *_biases,
const void *_weights,
size_t ld_weight_col,
size_t ld_weight_row)
// Typed views of the untyped arguments.
override 87 auto buffer =
static_cast<uint8_t *
>(_buffer);
88 auto biases =
static_cast<const int32_t *
>(_biases);
89 auto weights =
static_cast<const TWeight *
>(_weights);
// Number of output channels handled per iteration: the value returned by
// get_vector_length<int32_t> (presumably int32 lanes per vector - confirm).
93 const unsigned int iter_length =
94 arm_gemm::utils::get_vector_length<int32_t>(strategy::vl_type);
// NOTE(review): the initialiser of n_iters_per_input_channel is not visible
// here; the execute method in this file computes the same-named value as
// iceildiv(channel_multiplier, iter_length) - presumably identical, confirm.
95 const unsigned int n_iters_per_input_channel =
// Kernel columns are packed in groups of four.
98 const unsigned int n_dots_per_kernel_row =
arm_gemm::iceildiv(strategy::kernel_cols, 4u);
// Bytes occupied by one iteration's block; only the weights term of the
// per-lane sum is visible in this excerpt.
// NOTE(review): this uses sizeof(int8_t) although weights are read as
// TWeight; the two agree only if sizeof(TWeight) == 1 - confirm.
100 const size_t iter_stride = iter_length * (
102 4 * n_dots_per_kernel_row * strategy::kernel_rows *
sizeof(int8_t) +
// A leading dimension of 0 means "use the default stride".
106 ld_weight_col = (ld_weight_col == 0) ? this->m_args.input_channels * this->m_args.channel_multiplier : ld_weight_col;
107 ld_weight_row = (ld_weight_row == 0) ? this->m_args.kernel_cols * ld_weight_col : ld_weight_row;
// Outer loop: one pass per input channel.
109 for (
unsigned int input_channel = 0; input_channel < this->m_args.input_channels; input_channel++)
// Start of this input channel's region in the destination and of its first
// output channel in the source weights.
111 auto buffer_input_channel = buffer + input_channel * n_iters_per_input_channel * iter_stride;
112 auto weights_input_channel = weights + input_channel * this->m_args.channel_multiplier;
// Middle loop: one pass per block of up to iter_length output channels.
114 for (
unsigned int iter = 0; iter < n_iters_per_input_channel; iter++)
// Section pointers inside this iteration's block (see layout above).
119 auto buffer_base = buffer_input_channel + iter_stride * iter;
120 auto buffer_biases =
reinterpret_cast<int32_t *
>(buffer_base);
121 auto buffer_weights = buffer_base +
sizeof(int32_t) * iter_length;
122 auto buffer_requant_mul =
reinterpret_cast<int32_t *
>(
123 buffer_weights + strategy::kernel_rows * n_dots_per_kernel_row * 4 * iter_length);
124 auto buffer_requant_shift = buffer_requant_mul + iter_length;
// First source weight belonging to this iteration.
125 auto weights_base = weights_input_channel + iter * iter_length;
// Number of real channels in this iteration; the last iteration may cover
// fewer than iter_length channels.
129 const auto this_iter_length = std::min<unsigned int>(
130 iter_length, this->m_args.channel_multiplier - iter * iter_length
// Inner loop: one pass per output channel (lane) of this iteration.
132 for (
unsigned int i = 0; i < this_iter_length; i++)
134 auto weights_channel = weights_base + i;
// Take the next bias, or 0 when no bias array was supplied.
137 auto bias_value = biases ==
nullptr ? 0 : *(biases++);
// Accumulator used in the bias expression below.
// NOTE(review): the statement that adds weights into it is not visible in
// this excerpt.
138 int32_t elements_sum = 0;
// Walk the kernel rows.
142 for (
unsigned int ki = 0; ki < strategy::kernel_rows; ki++)
// Destination for lane i in kernel row ki: each lane owns 4 consecutive bytes
// per group, and each kernel row spans 4 * n_dots_per_kernel_row *
// iter_length bytes.
144 auto buffer_row = buffer_weights + i*4 + ki * 4 * n_dots_per_kernel_row * iter_length;
145 auto weights_row = weights_channel + ki * ld_weight_row;
// Copy the real kernel columns. `kj` is declared on a line not visible here.
148 for (; kj < strategy::kernel_cols; kj++)
// `dot` selects the group of four, `elem` the position inside that group.
151 const auto dot = kj / 4;
152 const auto elem = kj % 4;
155 const auto val = weights_row[kj * ld_weight_col];
156 buffer_row[dot * 4 * iter_length + elem] = val;
// Zero-fill the remaining positions up to a whole number of groups of four.
159 for (; kj < 4 * n_dots_per_kernel_row; kj++)
161 const auto dot = kj / 4;
162 const auto elem = kj % 4;
163 buffer_row[dot * 4 * iter_length + elem] = 0;
// NOTE(review): buffer_row appears to be re-derived from `ki` at the start of
// every kernel-row iteration, which would make this increment a dead store -
// confirm against the full source before relying on it.
166 buffer_row += 4 * n_dots_per_kernel_row * iter_length;
// Offset-corrected bias expression:
//   bias - a_offset * sum(weights)
//        + kernel_rows * kernel_cols * a_offset * b_offset
// NOTE(review): the assignment target is not visible in this excerpt
// (presumably the bias slot for lane i - confirm).
171 bias_value - m_qp.
a_offset * elements_sum +
172 strategy::kernel_rows * strategy::kernel_cols * m_qp.
a_offset * m_qp.
b_offset;
// Returns the working-space size in bytes for `n_threads` threads:
// n_threads times the output-buffer size for n_channels * channel_multiplier
// output channels (as computed by sizeof_output_buffer).
182 size_t get_working_size(
const unsigned int n_threads,
const unsigned int n_channels)
// Every input channel produces channel_multiplier output channels.
const override 184 const unsigned int n_output_channels = n_channels * this->m_args.channel_multiplier;
185 return n_threads * sizeof_output_buffer(n_output_channels);
// Re-expose the base class's execute overloads alongside the one defined
// below.
188 using Parent::execute;
// Parameter list of the execute override that follows.
// NOTE(review): the function-name line, the leading parameters and a few
// later ones (original lines 189-192, 199, 202) are not visible in this
// excerpt.
193 const unsigned int input_channels,
194 const PaddingValues &padding,
195 const void *
const _input,
196 const size_t ld_input_col,
197 const size_t ld_input_row,
198 const size_t ld_input_batch,
200 const unsigned int output_height,
201 const unsigned int output_width,
203 const size_t ld_output_col,
204 const size_t ld_output_row,
205 const size_t ld_output_batch,
206 void *
const _working_space,
207 const unsigned int thread_id,
208 const unsigned int n_threads
// Build the strategy object from the CPU information held in the arguments.
211 strategy strat(this->m_args.cpu_info);
// A profiler object is declared only when CYCLE_PROFILING is defined (the
// matching #endif is not visible in this excerpt).
212 #ifdef CYCLE_PROFILING 213 arm_gemm::profiler prof;
// Callback that runs the strategy's kernel. It captures `strat` by copy and
// `this`; the output-pointer parameter (`outptr_array`) is declared on a line
// not visible here.
216 auto executefn = [strat,
this] (
217 const TInput *
const *
const inptrs,
219 const void *
const params
// Forward to the kernel with the channel multiplier and the quantisation
// parameters stored in m_qp.
221 strat.kernel(inptrs, outptr_array, params, this->m_args.channel_multiplier, m_qp);
// Offset the shared working space by thread_id times the single-thread
// working size, so each thread uses its own slice.
225 uint8_t *
const working_space =
static_cast<uint8_t *
>(_working_space) +
get_working_size(1, input_channels) * thread_id;
// Recompute the packing geometry (same-named quantities as in
// pack_parameters).
228 const unsigned int iter_length =
229 arm_gemm::utils::get_vector_length<int32_t>(strategy::vl_type);
230 const unsigned int n_iters_per_input_channel =
arm_gemm::iceildiv(this->m_args.channel_multiplier, iter_length);
231 const unsigned int n_dots_per_kernel_row =
arm_gemm::iceildiv(strategy::kernel_cols, 4u);
// Size in bytes of the packed parameters for one input channel; only the
// weights term of the per-lane sum is visible in this excerpt.
// NOTE(review): presumably matches n_iters_per_input_channel * iter_stride
// from pack_parameters - confirm the hidden terms agree.
232 const size_t param_stride = n_iters_per_input_channel * iter_length * (
234 4 * n_dots_per_kernel_row * strategy::kernel_rows *
sizeof(int8_t) +
// Hand over to the shared depthwise-multiplier driver, passing the kernel
// callback, m_qp.a_offset, the stored arguments, the tensor pointers/strides
// and this thread's working space. Some arguments sit on lines not visible
// here.
238 common::depthwise_multiplier_execute<strategy>(
239 executefn, m_qp.
a_offset, this->m_args,
241 _input, ld_input_col, ld_input_row, ld_input_batch,
243 output_height, output_width,
244 _output, ld_output_col, ld_output_row, ld_output_batch,
245 working_space, thread_id, n_threads
T roundup(const T a, const T b)
T iceildiv(const T a, const T b)
size_t get_working_size(const unsigned int n_threads, const unsigned int n_channels) const override
const size_t input_height
void execute(const unsigned int batches, const unsigned int input_height, const unsigned int input_width, const unsigned int input_channels, const PaddingValues &padding, const void *const _input, const size_t ld_input_col, const size_t ld_input_row, const size_t ld_input_batch, const void *const parameters, const unsigned int output_height, const unsigned int output_width, void *const _output, const size_t ld_output_col, const size_t ld_output_row, const size_t ld_output_batch, void *const _working_space, const unsigned int thread_id, const unsigned int n_threads) const override
int32_t per_layer_right_shift
void pack_parameters(void *_buffer, const void *_biases, const void *_weights, size_t ld_weight_col, size_t ld_weight_row) override
size_t get_storage_size(void) const override
std::unique_ptr< ParametersLibrary > parameters
DepthwiseDepthfirstWithMultiplierQuantized & operator=(DepthwiseDepthfirstWithMultiplierQuantized &)=delete
DepthwiseDepthfirstWithMultiplierQuantized(const DepthwiseArgs &args, const arm_gemm::Requantize32 &qp)
const StratType * strategy
const int32_t * per_channel_right_shifts
const int32_t * requant_muls
template UniqueDepthwiseCommon< float > depthwise(const DepthwiseArgs &, const Nothing &)
const int32_t * requant_shifts
const int32_t * per_channel_muls