Compute Library
 21.05
depthwise_convolution_quantized.cl
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2017-2021 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 
25 #include "helpers_asymm.h"
26 
27 #ifndef VEC_SIZE
28 #if defined(N0)
29 #define VEC_SIZE N0
30 #else /* defined(N0) */
31 #define VEC_SIZE 8
32 #endif /* defined(N0) */
33 #endif /* VEC_SIZE */
34 
35 #if defined(ACTIVATION_TYPE) && defined(CONST_0)
37 #define ACTIVATION_FUNC(x) PERFORM_ACTIVATION_QUANT(ACTIVATION_TYPE, x)
38 #else /* defined(ACTIVATION_TYPE) && defined(CONST_0) */
39 #define ACTIVATION_FUNC(x) (x)
40 #endif /* defined(ACTIVATION_TYPE) && defined(CONST_0) */
41 
42 #define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
43 #define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
44 #define VEC_SHORT VEC_DATA_TYPE(short, VEC_SIZE)
45 
46 #if defined(DATA_TYPE) && defined(WEIGHTS_TYPE)
47 
48 #define VEC_TYPE(size) VEC_DATA_TYPE(DATA_TYPE, size)
49 
50 #if defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && ((defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)) || defined(REAL_MULTIPLIER))
51 
52 #if defined(WEIGHTS_PROMOTED_TYPE)
53 #define VEC_WEIGHTS_PROMOTED_TYPE(size) VEC_DATA_TYPE(WEIGHTS_PROMOTED_TYPE, size)
54 
55 #if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
56 #if defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
57 #define ARM_DOT(x, y, val) val = arm_dot_acc((x), (y), val);
58 #else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
59 #define ARM_DOT(x, y, val) val += arm_dot((x), (y));
60 #endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
61 #endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
62 
63 #if defined(CONV_STRIDE_Y) && defined(CONV_STRIDE_X) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS)
64 
65 #if CONV_STRIDE_X > 3
66 #error "Stride X not supported"
67 #endif /* CONV_STRIDE_X > 3 */
68 
69 #if !defined(IS_DOT8)
70 
71 #if DILATION_X == 1
72 
73 #if CONV_STRIDE_X == 1
74 #define GET_VALUES(first_value, left, middle, right) \
75  ({ \
76  int8 temp0 = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value)), int8); \
77  int2 temp1 = CONVERT(vload2(0, (__global DATA_TYPE *)(first_value + 8 * sizeof(DATA_TYPE))), int2); \
78  \
79  left = CONVERT(temp0.s01234567, int8); \
80  middle = CONVERT((int8)(temp0.s1234, temp0.s567, temp1.s0), int8); \
81  right = CONVERT((int8)(temp0.s2345, temp0.s67, temp1.s01), int8); \
82  })
83 #elif CONV_STRIDE_X == 2
84 #define GET_VALUES(first_value, left, middle, right) \
85  ({ \
86  int16 temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value)), int16); \
87  int temp1 = CONVERT(*((__global DATA_TYPE *)(first_value + 16 * sizeof(DATA_TYPE))), int); \
88  \
89  left = CONVERT(temp0.s02468ace, int8); \
90  middle = CONVERT(temp0.s13579bdf, int8); \
91  right = CONVERT((int8)(temp0.s2468, temp0.sace, temp1), int8); \
92  })
93 #else /* CONV_STRIDE_X */
94 #define GET_VALUES(first_value, left, middle, right) \
95  ({ \
96  int16 temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value)), int16); \
97  int8 temp1 = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value + 16 * sizeof(DATA_TYPE))), int8); \
98  \
99  left = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8); \
100  middle = CONVERT((int8)(temp0.s147a, temp0.sd, temp1.s036), int8); \
101  right = CONVERT((int8)(temp0.s258b, temp0.se, temp1.s147), int8); \
102  })
103 #endif /* CONV_STRIDE_X */
104 
105 #else /* DILATION_X == 1 */
106 
107 #if CONV_STRIDE_X == 1
108 #define GET_VALUES(first_value, left, middle, right) \
109  ({ \
110  left = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value)), int8); \
111  middle = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value + DILATION_X * sizeof(DATA_TYPE))), int8); \
112  right = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value + 2 * DILATION_X * sizeof(DATA_TYPE))), int8); \
113  })
114 #elif CONV_STRIDE_X == 2
115 #define GET_VALUES(first_value, left, middle, right) \
116  ({ \
117  int16 temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value)), int16); \
118  left = CONVERT(temp0.s02468ace, int8); \
119  \
120  temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value + DILATION_X * sizeof(DATA_TYPE))), int16); \
121  middle = CONVERT(temp0.s02468ace, int8); \
122  \
123  temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value + 2 * DILATION_X * sizeof(DATA_TYPE))), int16); \
124  right = CONVERT(temp0.s02468ace, int8); \
125  })
126 #else /* CONV_STRIDE_X */
127 #define GET_VALUES(first_value, left, middle, right) \
128  ({ \
129  int16 temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value)), int16); \
130  int8 temp1 = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value + 16 * sizeof(DATA_TYPE))), int8); \
131  left = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8); \
132  \
133  temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value + DILATION_X * sizeof(DATA_TYPE))), int16); \
134  temp1 = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value + (16 + DILATION_X) * sizeof(DATA_TYPE))), int8); \
135  middle = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8); \
136  \
137  temp0 = CONVERT(vload16(0, (__global DATA_TYPE *)(first_value + 2 * DILATION_X * sizeof(DATA_TYPE))), int16); \
138  temp1 = CONVERT(vload8(0, (__global DATA_TYPE *)(first_value + (16 + 2 * DILATION_X) * sizeof(DATA_TYPE))), int8); \
139  right = CONVERT((int8)(temp0.s0369, temp0.scf, temp1.s25), int8); \
140  })
141 
142 #endif /* CONV_STRIDE_X */
143 #endif /* DILATION_X==1 */
144 
145 /** This function computes the depthwise convolution quantized.
146  *
147  * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
148  * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
149  * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
150  * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
151  * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
152  * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
153  * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
154  * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
155  * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
156  * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
157  * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
158  * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
159  * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
160  * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
161  * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
162  * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
163  * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL
164  * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
165  * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
166  * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
167  * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
168  * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
169  * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
170  * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
171  * @param[in] output_multipliers_ptr Pointer to the output multipliers vector. Supported data types: S32
172  * @param[in] output_multipliers_stride_x Stride of the output multipliers vector in X dimension (in bytes)
173  * @param[in] output_multipliers_step_x output_multipliers_stride_x * number of elements along X processed per workitem(in bytes)
174  * @param[in] output_multipliers_offset_first_element_in_bytes The offset of the first element in the output multipliers vector
175  * @param[in] output_shifts_ptr Pointer to the output shifts vector. Supported data types: S32
176  * @param[in] output_shifts_stride_x Stride of the output shifts vector in X dimension (in bytes)
177  * @param[in] output_shifts_step_x output_shifts_stride_x * number of elements along X processed per workitem(in bytes)
178  * @param[in] output_shifts_offset_first_element_in_bytes The offset of the first element in the output shifts vector
179  * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: S32
180  * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
181  * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
182  * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
183  */
184 
185 __kernel void dwc_3x3_native_quantized8_nchw(
188  TENSOR3D_DECLARATION(weights),
189  VECTOR_DECLARATION(output_multipliers),
190  VECTOR_DECLARATION(output_shifts)
191 #if defined(HAS_BIAS)
192  ,
193  VECTOR_DECLARATION(biases)
194 #endif //defined(HAS_BIAS)
195 )
196 {
197  __global uchar *src_addr = src_ptr + get_global_id(0) * src_step_x + get_global_id(1) * src_step_y + get_global_id(2) * src_step_z;
199  Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
200  Vector output_multipliers = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output_multipliers);
201  Vector output_shifts = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output_shifts);
202 
203  // Extract channel and linearized batch indices
204  const int channel = get_global_id(2) % DST_CHANNELS;
205  const int batch = get_global_id(2) / DST_CHANNELS;
206 
207 #if defined(HAS_BIAS)
208  Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
209 
210  int bias_value = *((__global int *)(vector_offset(&biases, channel)));
211 #endif //defined(HAS_BIAS)
212 
213  // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
214  src_addr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
215  __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
216 
217  VEC_DATA_TYPE(WEIGHTS_TYPE, 3)
218  w0 = vload3(0, (__global WEIGHTS_TYPE *)(weights_addr + 0 * weights_stride_y));
219  VEC_DATA_TYPE(WEIGHTS_TYPE, 3)
220  w1 = vload3(0, (__global WEIGHTS_TYPE *)(weights_addr + 1 * weights_stride_y));
221  VEC_DATA_TYPE(WEIGHTS_TYPE, 3)
222  w2 = vload3(0, (__global WEIGHTS_TYPE *)(weights_addr + 2 * weights_stride_y));
223 
224 #if defined(PER_CHANNEL_QUANTIZATION)
225  const int output_multiplier = *((__global int *)vector_offset(&output_multipliers, channel));
226  const int output_shift = *((__global int *)vector_offset(&output_shifts, channel));
227 #endif // defined(PER_CHANNEL_QUANTIZATION)
228 
229  int8 values0 = 0;
230  int8 sum0 = 0;
231 #if CONV_STRIDE_Y == 1 && DILATION_Y == 1
232  int8 values1 = 0;
233  int8 sum1 = 0;
234 #endif /* CONV_STRIDE_Y &&DILATION_Y==1 */
235 
236  // Row0
237  int8 left, middle, right;
238  GET_VALUES(src_addr + 0 * src_stride_y, left, middle, right);
239  values0 += left * (int8)(w0.s0);
240  values0 += middle * (int8)(w0.s1);
241  values0 += right * (int8)(w0.s2);
242 
243 #if WEIGHTS_OFFSET != 0
244  sum0 += left + middle + right;
245 #endif /* WEIGHTS_OFFSET != 0 */
246 
247  // Row1
248  GET_VALUES(src_addr + DILATION_Y * src_stride_y, left, middle, right);
249  values0 += left * (int8)(w1.s0);
250  values0 += middle * (int8)(w1.s1);
251  values0 += right * (int8)(w1.s2);
252 
253 #if CONV_STRIDE_Y == 1 && DILATION_Y == 1
254  values1 += left * (int8)(w0.s0);
255  values1 += middle * (int8)(w0.s1);
256  values1 += right * (int8)(w0.s2);
257 #endif /* CONV_STRIDE_Y && DILATION_Y== 1 */
258 
259 #if WEIGHTS_OFFSET != 0
260  int8 tmp = left + middle + right;
261  sum0 += tmp;
262 #if CONV_STRIDE_Y == 1 && DILATION_Y == 1
263  sum1 += tmp;
264 #endif /* CONV_STRIDE_Y &&DILATION_Y== 1 */
265 #endif /* WEIGHTS_OFFSET != 0 */
266 
267  // Row2
268  GET_VALUES(src_addr + 2 * DILATION_Y * src_stride_y, left, middle, right);
269  values0 += left * (int8)(w2.s0);
270  values0 += middle * (int8)(w2.s1);
271  values0 += right * (int8)(w2.s2);
272 #if CONV_STRIDE_Y == 1 && DILATION_Y == 1
273  values1 += left * (int8)(w1.s0);
274  values1 += middle * (int8)(w1.s1);
275  values1 += right * (int8)(w1.s2);
276 #endif /* CONV_STRIDE_Y &&DILATION_Y == 1 */
277 
278 #if WEIGHTS_OFFSET != 0
279  tmp = left + middle + right;
280  sum0 += tmp;
281 #if CONV_STRIDE_Y == 1 && DILATION_Y == 1
282  sum1 += tmp;
283 #endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1 */
284 #endif /* WEIGHTS_OFFSET != 0 */
285 
286 #if CONV_STRIDE_Y == 1 && DILATION_Y == 1
287  // Row3
288  GET_VALUES(src_addr + 3 * src_stride_y, left, middle, right);
289  values1 += left * (int8)(w2.s0);
290  values1 += middle * (int8)(w2.s1);
291  values1 += right * (int8)(w2.s2);
292 
293 #if WEIGHTS_OFFSET != 0
294  sum1 += left + middle + right;
295 #endif /* WEIGHTS_OFFSET != 0 */
296 #endif /* CONV_STRIDE_Y && DILATION_Y == 1 */
297 
298 #if defined(HAS_BIAS)
299  values0 += (int8)(bias_value);
300 #if CONV_STRIDE_Y == 1 && DILATION_Y == 1
301  values1 += (int8)(bias_value);
302 #endif /* CONV_STRIDE_Y & &DILATION_Y == 1 */
303 #endif //defined(HAS_BIAS)
304 
305 #if WEIGHTS_OFFSET != 0
306  values0 += sum0 * (int8)(WEIGHTS_OFFSET);
307 #if CONV_STRIDE_Y == 1 && DILATION_Y == 1
308  values1 += sum1 * (int8)(WEIGHTS_OFFSET);
309 #endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1 */
310 #endif /* WEIGHTS_OFFSET != 0 */
311 
312 #if INPUT_OFFSET != 0
313  VEC_WEIGHTS_PROMOTED_TYPE(3)
314  tmp_we = CONVERT(w0, VEC_WEIGHTS_PROMOTED_TYPE(3)) + CONVERT(w1, VEC_WEIGHTS_PROMOTED_TYPE(3)) + CONVERT(w2, VEC_WEIGHTS_PROMOTED_TYPE(3));
315 
316  WEIGHTS_PROMOTED_TYPE sum_weights = tmp_we.s0 + tmp_we.s1 + tmp_we.s2;
317  values0 += sum_weights * (int8)(INPUT_OFFSET);
318 #if CONV_STRIDE_Y == 1 && DILATION_Y == 1
319  values1 += sum_weights * (int8)(INPUT_OFFSET);
320 #endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1 */
321 #endif /* INPUT_OFFSET != 0 */
322 
323 #if K_OFFSET != 0
324  values0 += (int8)(K_OFFSET);
325 #if CONV_STRIDE_Y == 1 && DILATION_Y == 1
326  values1 += (int8)(K_OFFSET);
327 #endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1*/
328 #endif /* K_OFFSET != 0 */
329 
330 #if defined(REAL_MULTIPLIER)
331 
332  values0 = CONVERT(round(CONVERT(values0, float8) * (float8)REAL_MULTIPLIER), int8);
333 
334 #else // defined(REAL_MULTIPLIER)
335 
336 #if defined(PER_CHANNEL_QUANTIZATION)
337  int8 res0_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values0, output_multiplier, output_shift, 8);
338  int8 res0_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, output_multiplier, output_shift, 8);
339  values0 = select(res0_shift_lt0, res0_shift_gt0, (int8)(output_shift) >= 0);
340 #else // defined(PER_CHANNEL_QUANTIZATION)
341 #if OUTPUT_SHIFT < 0
342  values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
343 #else // OUTPUT_SHIFT < 0
344  values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
345 #endif // OUTPUT_OFFSET < 0
346 #endif // defined(PER_CHANNEL_QUANTIZATION)
347 
348 #endif // defined(REAL_MULTIPLIER)
349 
350  values0 += (int8)OUTPUT_OFFSET;
351  VEC_TYPE(8)
352  res0 = CONVERT_SAT(values0, VEC_TYPE(8));
353 
354  vstore8(ACTIVATION_FUNC(res0), 0, dst.ptr);
355 #if CONV_STRIDE_Y == 1 && DILATION_Y == 1
356 #if defined(REAL_MULTIPLIER)
357 
358  values1 = CONVERT(round(CONVERT(values1, float8) * (float8)REAL_MULTIPLIER), int8);
359 
360 #else // defined(REAL_MULTIPLIER)
361 
362 #if defined(PER_CHANNEL_QUANTIZATION)
363  int8 res1_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values1, output_multiplier, output_shift, 8);
364  int8 res1_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, output_multiplier, output_shift, 8);
365  values1 = select(res1_shift_lt0, res1_shift_gt0, (int8)(output_shift) >= 0);
366 #else // defined(PER_CHANNEL_QUANTIZATION)
367 #if OUTPUT_SHIFT < 0
368  values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
369 #else // OUTPUT_SHIFT < 0
370  values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
371 #endif // OUTPUT_OFFSET < 0
372 #endif // defined(PER_CHANNEL_QUANTIZATION)
373 
374 #endif // defined(REAL_MULTIPLIER)
375 
376  values1 += (int8)OUTPUT_OFFSET;
377  VEC_TYPE(8)
378  res1 = CONVERT_SAT(values1, VEC_TYPE(8));
379 
380  vstore8(ACTIVATION_FUNC(res1), 0, dst.ptr + dst_stride_y);
381 #endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1*/
382 }
383 
384 #else // !defined(IS_DOT8)
385 
386 #if DILATION_X == 1
387 #if CONV_STRIDE_X == 1
388 #define GET_VALUES(first_value, left, middle, right) \
389  ({ \
390  VEC_TYPE(8) \
391  temp0 = vload8(0, (__global DATA_TYPE *)(first_value)); \
392  VEC_TYPE(2) \
393  temp1 = vload2(0, (__global DATA_TYPE *)(first_value + 8 * sizeof(DATA_TYPE))); \
394  \
395  left = temp0.s01234567; \
396  middle = (VEC_TYPE(8))(temp0.s1234, temp0.s567, temp1.s0); \
397  right = (VEC_TYPE(8))(temp0.s2345, temp0.s67, temp1.s01); \
398  })
399 #elif CONV_STRIDE_X == 2
400 #define GET_VALUES(first_value, left, middle, right) \
401  ({ \
402  VEC_TYPE(16) \
403  temp0 = vload16(0, (__global DATA_TYPE *)(first_value)); \
404  DATA_TYPE temp1 = *((__global DATA_TYPE *)(first_value + 16 * sizeof(DATA_TYPE))); \
405  \
406  left = temp0.s02468ace; \
407  middle = temp0.s13579bdf; \
408  right = (VEC_TYPE(8))(temp0.s2468, temp0.sace, temp1); \
409  })
410 #else /* CONV_STRIDE_X */
411 #define GET_VALUES(first_value, left, middle, right) \
412  ({ \
413  VEC_TYPE(16) \
414  temp0 = vload16(0, (__global DATA_TYPE *)(first_value)); \
415  VEC_TYPE(8) \
416  temp1 = vload8(0, (__global DATA_TYPE *)(first_value + 16 * sizeof(DATA_TYPE))); \
417  \
418  left = (VEC_TYPE(8))(temp0.s0369, temp0.scf, temp1.s25); \
419  middle = (VEC_TYPE(8))(temp0.s147a, temp0.sd, temp1.s036); \
420  right = (VEC_TYPE(8))(temp0.s258b, temp0.se, temp1.s147); \
421  })
422 #endif /* CONV_STRIDE_X */
423 #else /*DILATION_X==1*/
424 
425 #if CONV_STRIDE_X == 1
426 #define GET_VALUES(first_value, left, middle, right) \
427  ({ \
428  left = vload8(0, (__global DATA_TYPE *)(first_value)); \
429  middle = vload8(0, (__global DATA_TYPE *)(first_value + DILATION_X * sizeof(DATA_TYPE))); \
430  right = vload8(0, (__global DATA_TYPE *)(first_value + 2 * DILATION_X * sizeof(DATA_TYPE))); \
431  })
432 #elif CONV_STRIDE_X == 2
433 #define GET_VALUES(first_value, left, middle, right) \
434  ({ \
435  VEC_TYPE(16) \
436  temp0 = vload16(0, (__global DATA_TYPE *)(first_value)); \
437  left = temp0.s02468ace; \
438  temp0 = vload16(0, (__global DATA_TYPE *)(first_value + DILATION_X * sizeof(DATA_TYPE))); \
439  middle = temp0.s02468ace; \
440  temp0 = vload16(0, (__global DATA_TYPE *)(first_value + 2 * DILATION_X * sizeof(DATA_TYPE))); \
441  right = temp0.s02468ace; \
442  })
443 #else /* CONV_STRIDE_X */
444 #define GET_VALUES(first_value, left, middle, right) \
445  ({ \
446  VEC_TYPE(16) \
447  temp0 = vload16(0, (__global DATA_TYPE *)(first_value)); \
448  VEC_TYPE(8) \
449  temp1 = vload8(0, (__global DATA_TYPE *)(first_value + 16 * sizeof(DATA_TYPE))); \
450  left = (VEC_TYPE(8))(temp0.s0369, temp0.scf, temp1.s25); \
451  \
452  temp0 = vload16(0, (__global DATA_TYPE *)(first_value + DILATION_X * sizeof(DATA_TYPE))); \
453  temp1 = vload8(0, (__global DATA_TYPE *)(first_value + (16 + DILATION_X) * sizeof(DATA_TYPE))); \
454  middle = (VEC_TYPE(8))(temp0.s0369, temp0.scf, temp1.s25); \
455  \
456  temp0 = vload16(0, (__global DATA_TYPE *)(first_value + 2 * DILATION_X * sizeof(DATA_TYPE))); \
457  temp1 = vload8(0, (__global DATA_TYPE *)(first_value + (16 + 2 * DILATION_X) * sizeof(DATA_TYPE))); \
458  right = (VEC_TYPE(8))(temp0.s0369, temp0.scf, temp1.s25); \
459  })
460 
461 #endif /* CONV_STRIDE_X */
462 #endif /*DILATION_X==1*/
463 /** This function computes the depthwise convolution quantized using dot product when the data layout is NCHW.
464  *
465  * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
466  * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
467  * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
468  * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
469  * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
470  * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
471  * @param[in] src_step_z src_stride_z * number of elements along Y processed per workitem(in bytes)
472  * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
473  * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
474  * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
475  * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
476  * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
477  * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
478  * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
479  * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
480  * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
481  * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL
482  * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
483  * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
484  * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
485  * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
486  * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
487  * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
488  * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
489  * @param[in] output_multipliers_ptr Pointer to the output multipliers vector. Supported data types: S32
490  * @param[in] output_multipliers_stride_x Stride of the output multipliers vector in X dimension (in bytes)
491  * @param[in] output_multipliers_step_x output_multipliers_stride_x * number of elements along X processed per workitem(in bytes)
492  * @param[in] output_multipliers_offset_first_element_in_bytes The offset of the first element in the output multipliers vector
493  * @param[in] output_shifts_ptr Pointer to the output shifts vector. Supported data types: S32
494  * @param[in] output_shifts_stride_x Stride of the output shifts vector in X dimension (in bytes)
495  * @param[in] output_shifts_step_x output_shifts_stride_x * number of elements along X processed per workitem(in bytes)
496  * @param[in] output_shifts_offset_first_element_in_bytes The offset of the first element in the output shifts vector
497  * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: S32
498  * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
499  * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
500  * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
501  */
502 
503 __kernel void dwc_3x3_native_quantized8_dot8_nchw(
506  TENSOR3D_DECLARATION(weights),
507  VECTOR_DECLARATION(output_multipliers),
508  VECTOR_DECLARATION(output_shifts)
509 #if defined(HAS_BIAS)
510  ,
511  VECTOR_DECLARATION(biases)
512 #endif //defined(HAS_BIAS)
513 )
514 {
515  __global uchar *src_addr = src_ptr + get_global_id(0) * src_step_x + get_global_id(1) * src_step_y + get_global_id(2) * src_step_z;
517  Tensor3D weights = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(weights);
518  Vector output_multipliers = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output_multipliers);
519  Vector output_shifts = CONVERT_TO_VECTOR_STRUCT_NO_STEP(output_shifts);
520 
521  // Extract channel and linearized batch indices
522  const int channel = get_global_id(2) % DST_CHANNELS;
523  const int batch = get_global_id(2) / DST_CHANNELS;
524 
525 #if defined(HAS_BIAS)
526  Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
527 
528  const int bias_value = *((__global int *)(vector_offset(&biases, channel)));
529 #endif //defined(HAS_BIAS)
530 
531  // Load relevant input and weights data (Accounts depth multiplier when indexing input, OFM = IFM * DEPTH_MULTIPLIER)
532  src_addr -= batch * (DST_CHANNELS / DEPTH_MULTIPLIER) * (DEPTH_MULTIPLIER - 1) * src_step_z + (channel - (channel / DEPTH_MULTIPLIER)) * src_step_z;
533  __global uchar *weights_addr = weights.ptr + get_global_id(0) * weights_step_x + get_global_id(1) * weights_step_y + channel * weights_step_z;
534 
535  VEC_TYPE(3)
536  w0 = vload3(0, (__global WEIGHTS_TYPE *)(weights_addr + 0 * weights_stride_y));
537  VEC_TYPE(3)
538  w1 = vload3(0, (__global WEIGHTS_TYPE *)(weights_addr + 1 * weights_stride_y));
539  VEC_TYPE(3)
540  w2 = vload3(0, (__global WEIGHTS_TYPE *)(weights_addr + 2 * weights_stride_y));
541 
542  const int output_multiplier = *((__global int *)vector_offset(&output_multipliers, 0));
543  const int output_shift = *((__global int *)vector_offset(&output_shifts, 0));
544 
545  VEC_TYPE(8)
546  left0, middle0, right0;
547  VEC_TYPE(8)
548  left1, middle1, right1;
549  VEC_TYPE(8)
550  left2, middle2, right2;
551 
552  int8 values0 = 0;
553  int8 sum0 = 0;
554 
555  GET_VALUES(src_addr + 0 * src_stride_y, left0, middle0, right0);
556  GET_VALUES(src_addr + DILATION_Y * src_stride_y, left1, middle1, right1);
557  GET_VALUES(src_addr + 2 * DILATION_Y * src_stride_y, left2, middle2, right2);
558 
559 #if WEIGHTS_OFFSET != 0
560  sum0 += convert_int8(left0) + convert_int8(middle0) + convert_int8(right0);
561  sum0 += convert_int8(left1) + convert_int8(middle1) + convert_int8(right1);
562  sum0 += convert_int8(left2) + convert_int8(middle2) + convert_int8(right2);
563 #endif /* WEIGHTS_OFFSET != 0 */
564 
565 #if CONV_STRIDE_Y == 1 && DILATION_Y == 1
566  // If conv_stride_y is equals to 1, we compute two output rows
567 
568  VEC_TYPE(8)
569  left3, middle3, right3;
570  int8 values1 = 0;
571  int8 sum1 = 0;
572 
573  GET_VALUES(src_addr + 3 * src_stride_y, left3, middle3, right3);
574 
575 #if WEIGHTS_OFFSET != 0
576  sum1 += convert_int8(left1) + convert_int8(middle1) + convert_int8(right1);
577  sum1 += convert_int8(left2) + convert_int8(middle2) + convert_int8(right2);
578  sum1 += convert_int8(left3) + convert_int8(middle3) + convert_int8(right3);
579 #endif /* WEIGHTS_OFFSET != 0 */
580 #endif // CONV_STRIDE_Y == 1 && DILATION_Y==1
581 
582  ARM_DOT((VEC_TYPE(4))(left0.s0, middle0.s0, right0.s0, left1.s0), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s0);
583  ARM_DOT((VEC_TYPE(4))(middle1.s0, right1.s0, left2.s0, middle2.s0), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s0);
584  values0.s0 += right2.s0 * w2.s2;
585 
586  ARM_DOT((VEC_TYPE(4))(left0.s1, middle0.s1, right0.s1, left1.s1), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s1);
587  ARM_DOT((VEC_TYPE(4))(middle1.s1, right1.s1, left2.s1, middle2.s1), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s1);
588  values0.s1 += right2.s1 * w2.s2;
589 
590  ARM_DOT((VEC_TYPE(4))(left0.s2, middle0.s2, right0.s2, left1.s2), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s2);
591  ARM_DOT((VEC_TYPE(4))(middle1.s2, right1.s2, left2.s2, middle2.s2), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s2);
592  values0.s2 += right2.s2 * w2.s2;
593 
594  ARM_DOT((VEC_TYPE(4))(left0.s3, middle0.s3, right0.s3, left1.s3), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s3);
595  ARM_DOT((VEC_TYPE(4))(middle1.s3, right1.s3, left2.s3, middle2.s3), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s3);
596  values0.s3 += right2.s3 * w2.s2;
597 
598  ARM_DOT((VEC_TYPE(4))(left0.s4, middle0.s4, right0.s4, left1.s4), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s4);
599  ARM_DOT((VEC_TYPE(4))(middle1.s4, right1.s4, left2.s4, middle2.s4), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s4);
600  values0.s4 += right2.s4 * w2.s2;
601 
602  ARM_DOT((VEC_TYPE(4))(left0.s5, middle0.s5, right0.s5, left1.s5), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s5);
603  ARM_DOT((VEC_TYPE(4))(middle1.s5, right1.s5, left2.s5, middle2.s5), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s5);
604  values0.s5 += right2.s5 * w2.s2;
605 
606  ARM_DOT((VEC_TYPE(4))(left0.s6, middle0.s6, right0.s6, left1.s6), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s6);
607  ARM_DOT((VEC_TYPE(4))(middle1.s6, right1.s6, left2.s6, middle2.s6), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s6);
608  values0.s6 += right2.s6 * w2.s2;
609 
610  ARM_DOT((VEC_TYPE(4))(left0.s7, middle0.s7, right0.s7, left1.s7), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values0.s7);
611  ARM_DOT((VEC_TYPE(4))(middle1.s7, right1.s7, left2.s7, middle2.s7), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values0.s7);
612  values0.s7 += right2.s7 * w2.s2;
613 
614 #if CONV_STRIDE_Y == 1 && DILATION_Y == 1
615  ARM_DOT((VEC_TYPE(4))(left1.s0, middle1.s0, right1.s0, left2.s0), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s0);
616  ARM_DOT((VEC_TYPE(4))(middle2.s0, right2.s0, left3.s0, middle3.s0), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s0);
617  values1.s0 += right3.s0 * w2.s2;
618 
619  ARM_DOT((VEC_TYPE(4))(left1.s1, middle1.s1, right1.s1, left2.s1), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s1);
620  ARM_DOT((VEC_TYPE(4))(middle2.s1, right2.s1, left3.s1, middle3.s1), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s1);
621  values1.s1 += right3.s1 * w2.s2;
622 
623  ARM_DOT((VEC_TYPE(4))(left1.s2, middle1.s2, right1.s2, left2.s2), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s2);
624  ARM_DOT((VEC_TYPE(4))(middle2.s2, right2.s2, left3.s2, middle3.s2), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s2);
625  values1.s2 += right3.s2 * w2.s2;
626 
627  ARM_DOT((VEC_TYPE(4))(left1.s3, middle1.s3, right1.s3, left2.s3), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s3);
628  ARM_DOT((VEC_TYPE(4))(middle2.s3, right2.s3, left3.s3, middle3.s3), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s3);
629  values1.s3 += right3.s3 * w2.s2;
630 
631  ARM_DOT((VEC_TYPE(4))(left1.s4, middle1.s4, right1.s4, left2.s4), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s4);
632  ARM_DOT((VEC_TYPE(4))(middle2.s4, right2.s4, left3.s4, middle3.s4), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s4);
633  values1.s4 += right3.s4 * w2.s2;
634 
635  ARM_DOT((VEC_TYPE(4))(left1.s5, middle1.s5, right1.s5, left2.s5), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s5);
636  ARM_DOT((VEC_TYPE(4))(middle2.s5, right2.s5, left3.s5, middle3.s5), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s5);
637  values1.s5 += right3.s5 * w2.s2;
638 
639  ARM_DOT((VEC_TYPE(4))(left1.s6, middle1.s6, right1.s6, left2.s6), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s6);
640  ARM_DOT((VEC_TYPE(4))(middle2.s6, right2.s6, left3.s6, middle3.s6), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s6);
641  values1.s6 += right3.s6 * w2.s2;
642 
643  ARM_DOT((VEC_TYPE(4))(left1.s7, middle1.s7, right1.s7, left2.s7), (VEC_TYPE(4))(w0.s0, w0.s1, w0.s2, w1.s0), values1.s7);
644  ARM_DOT((VEC_TYPE(4))(middle2.s7, right2.s7, left3.s7, middle3.s7), (VEC_TYPE(4))(w1.s1, w1.s2, w2.s0, w2.s1), values1.s7);
645  values1.s7 += right3.s7 * w2.s2;
646 #endif // CONV_STRIDE_Y == 1 && DILATION_Y==1
647 
648 #if defined(HAS_BIAS)
649  values0 += (int8)(bias_value);
650 #if CONV_STRIDE_Y == 1 && DILATION_Y == 1
651  values1 += (int8)(bias_value);
652 #endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1 */
653 #endif //defined(HAS_BIAS)
654 
655 #if WEIGHTS_OFFSET != 0
656  values0 += sum0 * (int8)(WEIGHTS_OFFSET);
657 #if CONV_STRIDE_Y == 1 && DILATION_Y == 1
658  values1 += sum1 * (int8)(WEIGHTS_OFFSET);
659 #endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1 */
660 #endif /* WEIGHTS_OFFSET != 0 */
661 
662 #if INPUT_OFFSET != 0
663  WEIGHTS_PROMOTED_TYPE sum_weights = 0;
664  VEC_WEIGHTS_PROMOTED_TYPE(3)
665  tmp_we = CONVERT(w0, VEC_WEIGHTS_PROMOTED_TYPE(3)) + CONVERT(w1, VEC_WEIGHTS_PROMOTED_TYPE(3)) + CONVERT(w2, VEC_WEIGHTS_PROMOTED_TYPE(3));
666  sum_weights += tmp_we.s0 + tmp_we.s1 + tmp_we.s2;
667  values0 += sum_weights * (int8)(INPUT_OFFSET);
668 #if CONV_STRIDE_Y == 1 && DILATION_Y == 1
669  values1 += sum_weights * (int8)(INPUT_OFFSET);
670 #endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1*/
671 #endif /* INPUT_OFFSET != 0 */
672 
673 #if K_OFFSET != 0
674  values0 += (int8)(K_OFFSET);
675 #if CONV_STRIDE_Y == 1 && DILATION_Y == 1
676  values1 += (int8)(K_OFFSET);
677 #endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1*/
678 #endif /* K_OFFSET != 0 */
679 
680 #if defined(REAL_MULTIPLIER)
681 
682  values0 = CONVERT(round(CONVERT(values0, float8) * (float8)REAL_MULTIPLIER), int8);
683 
684 #else // defined(REAL_MULTIPLIER)
685 
686 #if defined(PER_CHANNEL_QUANTIZATION)
687  int8 res0_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values0, output_multiplier, output_shift, 8);
688  int8 res0_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, output_multiplier, output_shift, 8);
689  values0 = select(res0_shift_lt0, res0_shift_gt0, (int8)(output_shift) >= 0);
690 #else // defined(PER_CHANNEL_QUANTIZATION)
691 #if OUTPUT_SHIFT < 0
692  values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
693 #else // OUTPUT_SHIFT < 0
694  values0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values0, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
695 #endif // OUTPUT_OFFSET < 0
696 #endif // defined(PER_CHANNEL_QUANTIZATION)
697 
698 #endif // defined(REAL_MULTIPLIER)
699 
700  values0 += (int8)OUTPUT_OFFSET;
701  VEC_TYPE(8)
702  res0 = CONVERT_SAT(values0, VEC_TYPE(8));
703 
704  vstore8(ACTIVATION_FUNC(res0), 0, dst.ptr);
705 #if CONV_STRIDE_Y == 1 && DILATION_Y == 1
706 
707 #if defined(REAL_MULTIPLIER)
708 
709  values1 = CONVERT(round(CONVERT(values1, float8) * (float8)REAL_MULTIPLIER), int8);
710 
711 #else // defined(REAL_MULTIPLIER)
712 
713 #if defined(PER_CHANNEL_QUANTIZATION)
714  int8 res1_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values1, output_multiplier, output_shift, 8);
715  int8 res1_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, output_multiplier, output_shift, 8);
716  values1 = select(res1_shift_lt0, res1_shift_gt0, (int8)(output_shift) >= 0);
717 #else // defined(PER_CHANNEL_QUANTIZATION)
718 #if OUTPUT_SHIFT < 0
719  values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
720 #else // OUTPUT_SHIFT < 0
721  values1 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(values1, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, 8);
722 #endif // OUTPUT_OFFSET < 0
723 #endif // defined(PER_CHANNEL_QUANTIZATION)
724 
725 #endif // defined(REAL_MULTIPLIER)
726 
727  values1 += (int8)OUTPUT_OFFSET;
728  VEC_TYPE(8)
729  res1 = CONVERT_SAT(values1, VEC_TYPE(8));
730 
731  vstore8(ACTIVATION_FUNC(res1), 0, dst.ptr + dst_stride_y);
732 #endif /* CONV_STRIDE_Y == 1 && DILATION_Y==1*/
733 }
734 
735 #endif // !defined(IS_DOT8)
736 
737 #endif /* defined(CONV_STRIDE_Y) && defined(CONV_STRIDE_X) && defined(DEPTH_MULTIPLIER) && defined(DST_CHANNELS) */
738 
739 #if defined(VEC_SIZE) && defined(SRC_DIM_1) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT)
740 
741 #define asymm_mult_by_quant_multiplier_less_than_one(x, y, z) ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, y, z, VEC_SIZE)
742 
743 #define MULTIPLY_ADD(x, y, acc) acc += CONVERT(CONVERT(x, VEC_WEIGHTS_PROMOTED_TYPE(VEC_SIZE)) * CONVERT(y, VEC_WEIGHTS_PROMOTED_TYPE(VEC_SIZE)), VEC_INT)
744 
745 #if WEIGHTS_OFFSET != 0
746 #define MULTIPLY_ADD_ACCUMULATE(x, y, acc, sum) \
747  ({ \
748  sum += CONVERT(x, VEC_INT); \
749  MULTIPLY_ADD(x, y, acc); \
750  })
751 #else /* WEIGHTS_OFFSET != 0 */
752 #define MULTIPLY_ADD_ACCUMULATE(x, y, acc, sum) MULTIPLY_ADD(x, y, acc)
753 #endif /* WEIGHTS_OFFSET != 0 */
754 
755 #if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
756 #define DOT_PRODUCT(acc, val0, val1, val2, val3, val4, val5, val6, val7, val8, w0, w1) \
757  ({ \
758  ARM_DOT((VEC_TYPE(4))(val0, val1, val2, val3), w0.s0123, acc); \
759  ARM_DOT((VEC_TYPE(4))(val4, val5, val6, val7), w0.s4567, acc); \
760  acc += val8 * w1; \
761  })
762 
763 #define DOT_PRODUCT_REDUCTION(sum, val0, val1, val2, val3, val4, val5, val6, val7, val8) \
764  ({ \
765  sum = val0; \
766  ARM_DOT((VEC_TYPE(4))(val1, val2, val3, val4), (VEC_TYPE(4))1, sum); \
767  ARM_DOT((VEC_TYPE(4))(val5, val6, val7, val8), (VEC_TYPE(4))1, sum); \
768  })
769 
770 #define DOT_PRODUCT_REDUCTION_WEIGHTS(sum, w0, w1) \
771  ({ \
772  sum = w1; \
773  ARM_DOT(w0.s0123, (VEC_TYPE(4))1, sum); \
774  ARM_DOT(w0.s4567, (VEC_TYPE(4))1, sum); \
775  })
776 
777 #endif // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
778 
779 #endif // defined(VEC_SIZE) && defined(SRC_DIM_1) && defined(SRC_DIM_2) && defined(CONV_PAD_TOP) && defined(CONV_PAD_LEFT)
780 
781 #endif // defined(WEIGHTS_PROMOTED_TYPE)
782 
783 #endif // defined(WEIGHTS_OFFSET) && defined(INPUT_OFFSET) && defined(K_OFFSET) && ((defined(OUTPUT_OFFSET) && defined(OUTPUT_MULTIPLIER) && defined(OUTPUT_SHIFT)) || defined(REAL_MULTIPLIER))
784 
785 #if defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(N0) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(INPUT_OFFSET) && defined(WEIGHTS_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_SHIFT) && defined(OUTPUT_MULTIPLIER) && defined(VEC_SIZE_LEFTOVER)
786 /** This function computes the depthwise convolution for NHWC data layout.
787  *
788  * @note The number of elements processed must be passed at compile time using -DN0 (e.g. -DN0=2)
789  * @note The depth multiplier must be passed at compile time using -DDEPTH_MULTIPLIER (e.g. -DDEPTH_MULTIPLIER=1)
790  * @note The first dimension of the input tensor must be passed at compile time using -DSRC_DIM1 (e.g. -DSRC_DIM1=112)
791  * @note The second dimension of the input tensor must be passed at compile time using -DSRC_DIM2 (e.g. -DSRC_DIM2=80)
792  * @note The kernel width must be passed at compile time using -DKERNEL_WIDTH (e.g. -DKERNEL_WIDTH=5)
793  * @note The kernel height must be passed at compile time using -DKERNEL_HEIGHT (e.g. -DKERNEL_HEIGHT=5)
794  * @note The convolution pad top must be passed at compile time using -DCONV_PAD_TOP (e.g. -DCONV_PAD_TOP=1)
795  * @note The convolution pad top must be passed at compile time using -DCONV_PAD_LEFT (e.g. -DCONV_PAD_LEFT=1)
796  * @note The convolution stride along the width must be passed at compile time using -DCONV_STRIDE_X (e.g. -DCONV_STRIDE_Y=X)
797  * @note The convolution stride along the height must be passed at compile time using -DCONV_STRIDE_Y (e.g. -DCONV_STRIDE_Y=1)
798  * @note Leftover vector size has to be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
799  * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
800  * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
801  *
802  * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
803  * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
804  * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
805  * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
806  * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
807  * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
808  * @param[in] src_step_z src_stride_y * number of elements along Z processed per workitem(in bytes)
809  * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
810  * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
811  * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
812  * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
813  * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
814  * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
815  * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
816  * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
817  * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
818  * @param[in] dst_step_z dst_stride_z * number of elements along Y processed per workitem(in bytes)
819  * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
820  * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
821  * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
822  * @param[in] weights_ptr Pointer to the weights tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL
823  * @param[in] weights_stride_x Stride of the weights tensor in X dimension (in bytes)
824  * @param[in] weights_step_x weights_stride_x * number of elements along X processed per workitem(in bytes)
825  * @param[in] weights_stride_y Stride of the weights tensor in Y dimension (in bytes)
826  * @param[in] weights_step_y weights_stride_y * number of elements along Y processed per workitem(in bytes)
827  * @param[in] weights_stride_z Stride of the weights tensor in Z dimension (in bytes)
828  * @param[in] weights_step_z weights_stride_z * number of elements along Y processed per workitem(in bytes)
829  * @param[in] weights_offset_first_element_in_bytes The offset of the first element in the weights tensor
830  * @param[in] output_multipliers_ptr Pointer to the output multipliers vector. Supported data types: S32
831  * @param[in] output_multipliers_stride_x Stride of the output multipliers vector in X dimension (in bytes)
832  * @param[in] output_multipliers_step_x output_multipliers_stride_x * number of elements along X processed per workitem(in bytes)
833  * @param[in] output_multipliers_offset_first_element_in_bytes The offset of the first element in the output multipliers vector
834  * @param[in] output_shifts_ptr Pointer to the output shifts vector. Supported data types: S32
835  * @param[in] output_shifts_stride_x Stride of the output shifts vector in X dimension (in bytes)
836  * @param[in] output_shifts_step_x output_shifts_stride_x * number of elements along X processed per workitem(in bytes)
837  * @param[in] output_shifts_offset_first_element_in_bytes The offset of the first element in the output shifts vector
838  * @param[in] biases_ptr (Optional) Pointer to the biases vector. Supported data types: S32
839  * @param[in] biases_stride_x (Optional) Stride of the biases vector in X dimension (in bytes)
840  * @param[in] biases_step_x (Optional) biases_stride_x * number of elements along X processed per workitem(in bytes)
841  * @param[in] biases_offset_first_element_in_bytes (Optional) The offset of the first element in the biases vector
842  */
843 __kernel void dwc_MxN_native_quantized8_nhwc(
846  TENSOR3D_DECLARATION(weights),
847  VECTOR_DECLARATION(output_multipliers),
848  VECTOR_DECLARATION(output_shifts)
849 #if defined(HAS_BIAS)
850  ,
851  VECTOR_DECLARATION(biases)
852 #endif // defined(HAS_BIAS)
853 )
854 {
855  int x_offs = max((int)(get_global_id(0) * N0 - (N0 - VEC_SIZE_LEFTOVER) % N0), 0);
856  int y = get_global_id(1); // spatial coordinate x
857 #if defined(DST_DEPTH)
858  int z = get_global_id(2) % (int)DST_DEPTH; // spatial coordinate y
859  int b = get_global_id(2) / (int)DST_DEPTH; // batch
860 #else // defined(DST_DEPTH)
861  int z = get_global_id(2); // spatial coordinate y
862 #endif // defined(DST_DEPTH)
863 
864  __global uchar *s_addr = src_ptr + src_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE);
865 
866  __global uchar *d_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE) * (int)DEPTH_MULTIPLIER + y * dst_stride_y + z * dst_stride_z;
867 
868  __global uchar *w_addr = weights_ptr + weights_offset_first_element_in_bytes + x_offs * sizeof(WEIGHTS_TYPE) * (int)DEPTH_MULTIPLIER;
869 
870 #if defined(HAS_BIAS)
871  __global uchar *b_addr = biases_ptr + biases_offset_first_element_in_bytes + x_offs * sizeof(int) * (int)DEPTH_MULTIPLIER;
872 #endif // defined(HAS_BIAS)
873 
874 #if defined(PER_CHANNEL_QUANTIZATION)
875  __global uchar *out_mul_addr = output_multipliers_ptr + output_multipliers_offset_first_element_in_bytes + x_offs * sizeof(int) * (int)DEPTH_MULTIPLIER;
876  __global uchar *out_shift_addr = output_shifts_ptr + output_shifts_offset_first_element_in_bytes + x_offs * sizeof(int) * (int)DEPTH_MULTIPLIER;
877 #endif // defined(PER_CHANNEL_QUANTIZATION)
878 
879 #if defined(DST_DEPTH)
880  s_addr += b * src_stride_w;
881  d_addr += b * dst_stride_w;
882 #endif // defined(DST_DEPTH)
883 
884 #if DEPTH_MULTIPLIER > 1
885  for(int d = 0; d < (int)DEPTH_MULTIPLIER; ++d)
886  {
887 #endif // DEPTH_MULTIPLIER > 1
888  // Each work-item computes N0x1x1 elements
889  VEC_INT res = 0;
890 
891  int x_coord = y * CONV_STRIDE_X - (int)CONV_PAD_LEFT;
892  int y_coord = z * CONV_STRIDE_Y - (int)CONV_PAD_TOP;
893 
894  for(int yk = 0; yk < KERNEL_HEIGHT; ++yk)
895  {
896  if(y_coord >= 0 && y_coord < SRC_DIM2)
897  {
898  int x_coord_tmp = x_coord;
899 
900  for(int xk = 0; xk < KERNEL_WIDTH; ++xk)
901  {
902  if(x_coord_tmp >= 0 && x_coord_tmp < SRC_DIM1)
903  {
904  int s_offset = x_coord_tmp * (int)src_stride_y + y_coord * (int)src_stride_z;
905  int w_offset = xk * weights_stride_y + yk * weights_stride_z;
906 
907  // Load input and weights values
908  VEC_INT i = CONVERT(VLOAD(N0)(0, (__global DATA_TYPE *)(s_addr + s_offset)), VEC_INT);
909  VEC_INT w = CONVERT(VLOAD(N0)(0, (__global WEIGHTS_TYPE *)(w_addr + w_offset)), VEC_INT);
910 
911  res += (i + (VEC_INT)INPUT_OFFSET) * (w + (VEC_INT)WEIGHTS_OFFSET);
912  }
913  x_coord_tmp += DILATION_X;
914  }
915  }
916  y_coord += DILATION_Y;
917  }
918 
919 #if defined(HAS_BIAS)
920  VEC_INT bias = VLOAD(N0)(0, (__global int *)(b_addr));
921  res += bias;
922 #endif // defined(HAS_BIAS)
923 
924 #if defined(PER_CHANNEL_QUANTIZATION)
925  VEC_INT output_multiplier = VLOAD(N0)(0, (__global int *)(out_mul_addr));
926  VEC_INT output_shift = VLOAD(N0)(0, (__global int *)(out_shift_addr));
927 
928  VEC_INT res_shift_lt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(res, output_multiplier, output_shift, N0);
929  VEC_INT res_shift_gt0 = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(res, output_multiplier, output_shift, N0);
930  res = select(res_shift_lt0, res_shift_gt0, (VEC_INT)(output_shift) >= 0);
931 #else // defined(PER_CHANNEL_QUANTIZATION)
932 #if OUTPUT_SHIFT < 0
933  res = ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(res, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, N0);
934 #else // OUTPUT_SHIFT < 0
935  res = ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(res, OUTPUT_MULTIPLIER, OUTPUT_SHIFT, N0);
936 #endif // OUTPUT_OFFSET < 0
937 #endif // defined(PER_CHANNEL_QUANTIZATION)
938 
939  res += (VEC_INT)OUTPUT_OFFSET;
940 
941  VEC_TYPE(VEC_SIZE)
942  res0 = CONVERT_SAT(res, VEC_TYPE(VEC_SIZE));
943  res0 = ACTIVATION_FUNC(res0);
944 
945  STORE_VECTOR_SELECT(res, DATA_TYPE, d_addr, N0, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
946 
947 #if DEPTH_MULTIPLIER > 1
948  w_addr += sizeof(WEIGHTS_TYPE);
949  d_addr += sizeof(DATA_TYPE);
950 #if defined(PER_CHANNEL_QUANTIZATION)
951  out_mul_addr += sizeof(int);
952  out_shift_addr += sizeof(int);
953 #endif // defined(PER_CHANNEL_QUANTIZATION)
954 #if defined(HAS_BIAS)
955  b_addr += sizeof(int);
956 #endif // defined(HAS_BIAS)
957  }
958 #endif // DEPTH_MULTIPLIER > 1
959 }
960 #endif // defined(SRC_DIM1) && defined(SRC_DIM2) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defiend(N0) && defined(DILATION_X) && defined(DILATION_Y) && defined(CONV_STRIDE_X) && defined(CONV_STRIDE_Y) && defined(CONV_PAD_LEFT) && defined(CONV_PAD_TOP) && defined(INPUT_OFFSET) && defined(WEIGHTS_OFFSET) && defined(OUTPUT_OFFSET) && defined(OUTPUT_SHIFT) && defined(OUTPUT_MULTIPLIER) && defined(VEC_SIZE_LEFTOVER)
961 #endif // defined(DATA_TYPE) && defined(WEIGHTS_TYPE)
Structure to hold Vector information.
Definition: helpers.h:666
const size_t weights_stride_z
SimpleTensor< float > w
Definition: DFT.cpp:156
#define CONVERT(x, type)
Definition: helpers.h:522
SimpleTensor< float > b
Definition: DFT.cpp:157
#define CONVERT_SAT(a, b)
Structure to hold 3D tensor information.
Definition: helpers.h:683
SimpleTensor< float > src
Definition: DFT.cpp:155
#define CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(name)
Definition: helpers.h:651
#define CONVERT_TENSOR3D_TO_IMAGE_STRUCT(name)
Definition: helpers.h:644
#define VECTOR_DECLARATION(name)
Definition: helpers.h:590
Structure to hold Image information.
Definition: helpers.h:674
int round(float x, RoundingPolicy rounding_policy)
Return a rounded value of x.
Definition: Rounding.cpp:35
#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond)
Store a vector that can only be partial in x.
SimpleTensor< T > select(const SimpleTensor< uint8_t > &c, const SimpleTensor< T > &x, const SimpleTensor< T > &y)
Definition: Select.cpp:38
__global const uchar * vector_offset(const Vector *vec, int x)
Get the pointer position of a Vector.
Definition: helpers.h:850
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_GREATER_THAN_ONE(x, quantized_multiplier, left_shift, size)
#define ACTIVATION_FUNC(x)
const size_t weights_stride_y
#define ASYMM_MULT_BY_QUANT_MULTIPLIER_LESS_THAN_ONE(x, quantized_multiplier, right_shift, size)
#define TENSOR4D_DECLARATION(name)
Definition: helpers.h:614
__global uchar * ptr
Pointer to the starting postion of the buffer.
Definition: helpers.h:685
#define VLOAD(size)
Definition: helpers.h:203
#define TENSOR3D_DECLARATION(name)
Definition: helpers.h:604
#define CONVERT_TO_VECTOR_STRUCT_NO_STEP(name)
Definition: helpers.h:629
#define VEC_DATA_TYPE(type, size)
Definition: helpers.h:519