30 namespace interleaves {
// Fragment of a routine that returns a byte count (see the `return` below).
// Its signature line and the tail of the returned expression are not part of
// this excerpt, so only the visible terms are described here.
34 const DepthwiseArgs &
args,
36 const unsigned int accumulator_depth_vl
// Elements handled per iteration: accumulator_depth_vl times the int32_t
// vector length reported for vl_type.
42 const unsigned int iter_length = accumulator_depth_vl * arm_gemm::utils::get_vector_length<int32_t>(vl_type);
// Result is n_iters * iter_length * (bytes per element of an iteration).
48 return n_iters * iter_length * (
// First visible term: int8_t weight bytes, 4 * n_dots_per_kernel_row for each
// of args.kernel_rows rows. More terms follow the trailing `+` but are not
// visible here.
50 4 * n_dots_per_kernel_row *
args.kernel_rows *
sizeof(int8_t) +
// Fragment of a routine that packs biases and weights into `_buffer`.
// The signature line, braces and several statements are missing from this
// excerpt; the comments below describe only what the visible lines do.
//
// Visible parameters:
//   _buffer        destination, addressed as bytes below
//   biases         optional int32_t biases (nullptr is treated as all-zero)
//   weights        source weights of type T
//   ld_weight_col  element stride between kernel columns (0 selects a default)
//   ld_weight_row  element stride between kernel rows (0 selects a default)
//   args           supplies input_channels, channel_multiplier, kernel_rows
//                  and kernel_cols
//   accumulator_depth_vl  multiplier applied to the int32_t vector length
57 void *_buffer,
const int32_t *biases,
58 const T *weights,
size_t ld_weight_col,
size_t ld_weight_row,
59 const DepthwiseArgs &
args,
62 const unsigned int accumulator_depth_vl
// Byte-typed view of the destination so the offsets below are in bytes.
65 auto buffer =
static_cast<uint8_t *
>(_buffer);
// Elements handled per iteration: accumulator_depth_vl times the int32_t
// vector length reported for vl_type.
69 const unsigned int iter_length = accumulator_depth_vl * arm_gemm::utils::get_vector_length<int32_t>(vl_type);
// Bytes occupied by one iteration's block. Only the weight term is visible:
// 4 * n_dots_per_kernel_row elements of T for each kernel row; the remaining
// terms after the `+` are outside this excerpt.
73 const size_t iter_stride = iter_length * (
75 4 * n_dots_per_kernel_row *
args.kernel_rows *
sizeof(T) +
// A stride of 0 selects a default: columns are input_channels *
// channel_multiplier elements apart, rows are kernel_cols columns apart.
79 ld_weight_col = (ld_weight_col == 0) ?
args.input_channels *
args.channel_multiplier : ld_weight_col;
80 ld_weight_row = (ld_weight_row == 0) ?
args.kernel_cols * ld_weight_col : ld_weight_row;
// Each input channel owns n_iters_per_input_channel consecutive iteration
// blocks in the buffer and channel_multiplier consecutive weight elements.
82 for (
unsigned int input_channel = 0; input_channel <
args.input_channels; input_channel++)
84 auto buffer_input_channel = buffer + input_channel * n_iters_per_input_channel * iter_stride;
85 auto weights_input_channel = weights + input_channel *
args.channel_multiplier;
87 for (
unsigned int iter = 0; iter < n_iters_per_input_channel; iter++)
// Layout of one iteration block as addressed here:
//   [iter_length int32_t biases]
//   [kernel_rows * n_dots_per_kernel_row * 4 * iter_length weight bytes]
//   [iter_length int32_t requantisation multipliers]
//   [requantisation shifts, starting right after the multipliers]
92 auto buffer_base = buffer_input_channel + iter_stride * iter;
93 auto buffer_biases =
reinterpret_cast<int32_t *
>(buffer_base);
94 auto buffer_weights = buffer_base +
sizeof(int32_t) * iter_length;
95 auto buffer_requant_mul =
reinterpret_cast<int32_t *
>(
96 buffer_weights +
args.kernel_rows * n_dots_per_kernel_row * 4 * iter_length);
97 auto buffer_requant_shift = buffer_requant_mul + iter_length;
// First source weight for this iteration's group of channels.
98 auto weights_base = weights_input_channel + iter * iter_length;
// Channels packed in this iteration: iter_length, capped by what remains of
// args.channel_multiplier (the closing of this call is not visible here).
102 const auto this_iter_length = std::min<unsigned int>(
103 iter_length,
args.channel_multiplier - iter * iter_length
105 for (
unsigned int i = 0; i < this_iter_length; i++)
107 auto weights_channel = weights_base + i;
// Use 0 when no bias array was supplied; otherwise read the next bias and
// advance the pointer.
110 auto bias_value = biases ==
nullptr ? 0 : *(biases++);
// Starts at zero and is used in the bias expression at the end of this
// excerpt; the statements that add to it are not visible here.
111 int32_t elements_sum = 0;
115 for (
unsigned int ki = 0; ki <
args.kernel_rows; ki++)
// Destination for channel i, kernel row ki: each channel owns 4 consecutive
// bytes within a group, and each kernel row spans
// 4 * n_dots_per_kernel_row * iter_length bytes.
117 auto buffer_row = buffer_weights + i*4 + ki * 4 * n_dots_per_kernel_row * iter_length;
118 auto weights_row = weights_channel + ki * ld_weight_row;
// Copy the kernel columns of this row (kj is declared outside this excerpt).
121 for (; kj <
args.kernel_cols; kj++)
// Column kj goes to group kj / 4, byte kj % 4 within that group.
124 const auto dot = kj / 4;
125 const auto elem = kj % 4;
128 const auto val = weights_row[kj * ld_weight_col];
129 buffer_row[dot * 4 * iter_length + elem] = val;
// Zero-fill the remaining slots up to 4 * n_dots_per_kernel_row.
132 for (; kj < 4 * n_dots_per_kernel_row; kj++)
134 const auto dot = kj / 4;
135 const auto elem = kj % 4;
136 buffer_row[dot * 4 * iter_length + elem] = 0;
// NOTE(review): buffer_row is recomputed from ki at the top of the row loop,
// so this increment looks redundant -- confirm against the full source, since
// the enclosing braces are not visible here.
139 buffer_row += 4 * n_dots_per_kernel_row * iter_length;
// Start of an expression combining the bias with qp.a_offset * elements_sum;
// its destination and remaining terms are outside this excerpt.
144 bias_value - qp.
a_offset * elements_sum +