// MergeResults: generic (non-specialised) routine that walks an output region
// block by block and combines blocked input data into it.
//
// Template parameters:
//   twidth  - block width in elements (copied into the local `width`).
//   height  - block height in rows.
//   sve     - defaults to false; not referenced by any line visible here.
//   Tin     - element type read from `in`.
//   Tout    - element type of `out` and `bias`.
//
// Parameters:
//   out         - destination array, indexed as out[row * ldc + column].
//   in          - source data; element (row, col) of the current block is read
//                 from in[row * width + col].
//   ldc         - row stride of `out`, in elements.
//   y0, ymax    - row range; its extent is (ymax - y0).
//   x0, xmax    - column range; its extent is (xmax - x0).
//   bias        - indexed by absolute column (xbase + col).
//   act         - activation descriptor; only act.param1 is read in the visible lines.
//   append      - not referenced by any line visible here.
//
// NOTE(review): the embedded line numbering jumps (68 -> 75 -> 84 -> 88 -> 96),
// so this listing omits source lines. The conditions guarding the bias add and
// the two clamps, any use of `append`, the store back through `r`, and all
// closing braces are not visible here - confirm against the complete file
// before relying on these comments.
37 template<
unsigned int tw
idth,
unsigned int height,
bool sve=false,
typename Tin,
typename Tout>
38 void MergeResults(Tout * out,
const Tin * in,
int ldc,
int y0,
int ymax,
int x0,
int xmax,
const Tout *
bias,
Activation act,
bool append) {
// Signed copy of the compile-time block width, so the column arithmetic below is done in int.
45 const int width = twidth;
// Row blocking: complete blocks, leftover rows, and total blocks (one extra when rows are left over).
// `height` is unsigned, so (ymax - y0) is converted to unsigned for / and %;
// presumably callers guarantee ymax >= y0 - TODO confirm.
47 const int full_y_blocks = (ymax - y0) / height;
48 const int y_remainder = (ymax - y0) % height;
49 const int y_blocks = full_y_blocks + (y_remainder ? 1 : 0);
// Column blocking, same scheme as for rows but using the signed `width`.
51 const int full_x_blocks = (xmax - x0) / width;
52 const int x_remainder = (xmax - x0) % width;
53 const int x_blocks = full_x_blocks + (x_remainder ? 1 : 0);
// Iterate over row blocks.
55 for (
int y_block = 0; y_block < y_blocks; y_block++) {
// First output row covered by this row block.
56 int ybase = y0 + (y_block * height);
// A complete block covers `height` rows; the trailing partial block covers only the remainder.
58 int fill_rows = (y_block < full_y_blocks) ? height : y_remainder;
// Iterate over column blocks within the current row block.
60 for (
int x_block = 0; x_block < x_blocks; x_block++) {
// First output column covered by this column block.
61 int xbase = x0 + (x_block * width);
// A complete block covers `width` columns; the trailing partial block covers only the remainder.
63 int fill_cols = (x_block < full_x_blocks) ? width : x_remainder;
// Visit only the valid rows and columns of the current block.
65 for (
int row=0; row < fill_rows; row++) {
66 for (
int col=0; col < fill_cols; col++) {
// `r` refers to the destination element; `v` starts as the matching input element.
// The input row stride is always `width`, even when fill_cols is smaller.
67 Tout &r = out[(ybase + row) * ldc + xbase + col];
68 Tout v = in[row * width + col];
// Add the bias entry for this absolute column.
// NOTE(review): any guard on this statement (e.g. a null check on `bias`) is not visible here.
75 v +=
bias[xbase + col];
// Raise v to at least 0.
// NOTE(review): the condition selecting this statement is not visible here.
84 v = std::max(v,
static_cast<Tout
>(0));
// Limit v to at most act.param1 (converted to Tout), then raise it to at least 0.
// NOTE(review): the condition selecting this statement is not visible here.
88 v = std::max(std::min(v,
static_cast<Tout
>(act.
param1)),
static_cast<Tout
>(0));
// Advance the input pointer by one full width x height block of elements.
// NOTE(review): the enclosing loop level is not visible here; presumably once per column block - confirm.
96 in += (width * height);
// Explicit instantiation of MergeResults for twidth = 6, height = 8, sve = false,
// with Tin = float and Tout = float. The argument list mirrors the template's
// signature: (out, in, ldc, y0, ymax, x0, xmax, bias, act, append).
106 template void MergeResults<6u, 8u, false, float, float>(
float *,
float const*,
int,
int,
int,
int,
int,
float const *,
Activation,
bool);
// Explicit instantiation of MergeResults for twidth = 12, height = 8, sve = false,
// reading float input (Tin) and writing __fp16 output and bias (Tout).
// Compiled only when both __aarch64__ and __ARM_FP16_ARGS are defined.
// NOTE(review): the matching #endif is not visible in this listing.
109 #if defined(__aarch64__) && defined(__ARM_FP16_ARGS)
110 template void MergeResults<12u, 8u, false, float, __fp16>(__fp16*,
float const*,
int,
int,
int,
int,
int, __fp16
const*,
Activation,
bool);
// Explicit instantiation of MergeResults for twidth = 8, height = 6, sve = false,
// reading float input (Tin) and writing __fp16 output and bias (Tout).
// Compiled only when both __arm__ and __ARM_FP16_ARGS are defined.
// NOTE(review): the matching #endif is not visible in this listing.
113 #if defined(__arm__) && defined(__ARM_FP16_ARGS)
114 template void MergeResults<8u, 6u, false, float, __fp16>(__fp16*,
float const*,
int,
int,
int,
int,
int, __fp16
const*,
Activation,
bool);