114 #define _IWEI_WIDTH WEI_WIDTH
115 #define _IWEI_HEIGHT WEI_HEIGHT
116 #define _ISRC_WIDTH SRC_WIDTH
117 #define _ISRC_HEIGHT SRC_HEIGHT
118 #define _ISRC_CHANNELS SRC_CHANNELS
119 #define _IDST_WIDTH DST_WIDTH
120 #define _IDST_HEIGHT DST_HEIGHT
121 #define _IDST_CHANNELS DST_CHANNELS
122 #define _IY_MULTIPLIER (_IWEI_WIDTH * _IWEI_HEIGHT)
124 #if defined(IS_QUANTIZED)
125 #define _IOUTPUT_TILE cq
126 #else // defined(IS_QUANTIZED)
127 #define _IOUTPUT_TILE c
128 #endif // defined(IS_QUANTIZED)
136 TILE(
int, 1, M0, xi);
137 TILE(
int, 1, M0, yi);
138 TILE(
int, 1, M0, xu);
139 TILE(
int, 1, M0, yu);
144 xu[0].s[i] = ((mout + i) %
_IDST_WIDTH) - PAD_LEFT;
146 xi[0].s[i] = ceil(xu[0].s[i] / (
float)STRIDE_X);
147 yi[0].s[i] = ceil(yu[0].s[i] / (
float)STRIDE_Y);
151 TILE(ACC_DATA_TYPE, M0, N0, c);
159 const int x_start =
_IWEI_WIDTH - (xi[0].s[0] * STRIDE_X - xu[0].s[0]) - 1;
160 const int y_start =
_IWEI_HEIGHT - (yi[0].s[0] * STRIDE_Y - yu[0].s[0]) - 1;
162 for(
int yk = y_start, yi_step = 0; yk >= 0; yk -= STRIDE_Y, ++yi_step)
164 for(
int xk = x_start, xi_step = 0; xk >= 0; xk -= STRIDE_X, ++xi_step)
168 TILE(
int, 1, M0, my);
172 int x_s = xi[0].s[i] + xi_step;
173 int y_s = yi[0].s[i] + yi_step;
176 my[0].s[i] =
select(-1, my[0].s[i], x_s >= 0);
178 my[0].s[i] =
select(-1, my[0].s[i], y_s >= 0);
185 TILE(SRC_DATA_TYPE, M0, K0, a);
186 TILE(WEI_DATA_TYPE, N0, K0,
b);
206 T_MMUL(SRC_DATA_TYPE, WEI_DATA_TYPE, ACC_DATA_TYPE, M0, N0, K0, NT, T, a,
b, c);
208 #if defined(IS_QUANTIZED)
212 #endif // defined(IS_QUANTIZED)
216 #if defined(LEFTOVER_LOOP)
220 TILE(SRC_DATA_TYPE, M0, 1, a);
221 TILE(WEI_DATA_TYPE, N0, 1,
b);
238 T_MMUL(SRC_DATA_TYPE, WEI_DATA_TYPE, ACC_DATA_TYPE, M0, N0, 1, NT, T, a,
b, c);
240 #if defined(IS_QUANTIZED)
244 #endif // defined(IS_QUANTIZED)
246 #endif // defined(LEFTOVER_LOOP)
250 #if defined(IS_QUANTIZED)
251 const int total_pixels = floor((1 + y_start / (
float)STRIDE_Y)) * floor(1 + x_start / (
float)STRIDE_X);
254 #endif // defined(IS_QUANTIZED)
256 #if defined(HAS_BIAS)
257 TILE(BIA_DATA_TYPE, 1, N0, bias0);
259 T_LOAD(BIA_DATA_TYPE, 1, N0, BUFFER, bia, cout, 0, 1, 0, bias0);
266 #if defined(IS_QUANTIZED)
268 TILE(DST_DATA_TYPE, M0, N0, cq);
271 T_QUANTIZE8_ASYMMETRIC(ACC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, c, cq);
272 #endif // defined(IS_QUANTIZED)
274 TILE(uint, M0, 1, dst_indirect_y);
283 bool x_cond = PARTIAL_N0 != 0 && get_global_id(0) == 0;
286 T_STORE_INDIRECT_WIDTH_SELECT(DST_DATA_TYPE, M0, N0, PARTIAL_N0, DST_TENSOR_TYPE,
dst, cout, dst_stride_y, x_cond,
_IOUTPUT_TILE, dst_indirect_y);
292 #undef _ISRC_CHANNELS
295 #undef _IDST_CHANNELS
296 #undef _IY_MULTIPLIER