Compute Library 22.11
tile_helpers.h
/*
 * Copyright (c) 2021-2022 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

// *INDENT-OFF*
// clang-format off
#ifndef ARM_COMPUTE_TILE_HELPERS_H
#define ARM_COMPUTE_TILE_HELPERS_H

#define TILE_VECTOR_SIZE1 1
#define TILE_VECTOR_SIZE2 2
#define TILE_VECTOR_SIZE3 3
#define TILE_VECTOR_SIZE4 4
#define TILE_VECTOR_SIZE5 8
#define TILE_VECTOR_SIZE6 8
#define TILE_VECTOR_SIZE7 8
#define TILE_VECTOR_SIZE8 8
#define TILE_VECTOR_SIZE9 16
#define TILE_VECTOR_SIZE10 16
#define TILE_VECTOR_SIZE11 16
#define TILE_VECTOR_SIZE12 16
#define TILE_VECTOR_SIZE13 16
#define TILE_VECTOR_SIZE14 16
#define TILE_VECTOR_SIZE15 16
#define TILE_VECTOR_SIZE16 16

#define TILE_VECTOR_TYPE1(DATA_TYPE) DATA_TYPE##1
#define TILE_VECTOR_TYPE2(DATA_TYPE) DATA_TYPE##2
#define TILE_VECTOR_TYPE3(DATA_TYPE) DATA_TYPE##3
#define TILE_VECTOR_TYPE4(DATA_TYPE) DATA_TYPE##4
#define TILE_VECTOR_TYPE5(DATA_TYPE) DATA_TYPE##8
#define TILE_VECTOR_TYPE6(DATA_TYPE) DATA_TYPE##8
#define TILE_VECTOR_TYPE7(DATA_TYPE) DATA_TYPE##8
#define TILE_VECTOR_TYPE8(DATA_TYPE) DATA_TYPE##8
#define TILE_VECTOR_TYPE9(DATA_TYPE) DATA_TYPE##16
#define TILE_VECTOR_TYPE10(DATA_TYPE) DATA_TYPE##16
#define TILE_VECTOR_TYPE11(DATA_TYPE) DATA_TYPE##16
#define TILE_VECTOR_TYPE12(DATA_TYPE) DATA_TYPE##16
#define TILE_VECTOR_TYPE13(DATA_TYPE) DATA_TYPE##16
#define TILE_VECTOR_TYPE14(DATA_TYPE) DATA_TYPE##16
#define TILE_VECTOR_TYPE15(DATA_TYPE) DATA_TYPE##16
#define TILE_VECTOR_TYPE16(DATA_TYPE) DATA_TYPE##16

/** Tile object
 *  A tile object is a 2D memory block and can be accessed using the following syntax:
 *  -# a[m0].v     = access the vector at row "m0" (OpenCL vector)
 *  -# a[m0].s[n0] = access the scalar element at row "m0" and column "n0" (scalar access)
 *
 * @param[in] DATA_TYPE Data type of the tile
 * @param[in] H         Number of tile rows
 * @param[in] W         Number of tile columns
 * @param[in] BASENAME  Tile's name
 */
#define TILE(DATA_TYPE, H, W, BASENAME) TILE_STR(DATA_TYPE, H, W, BASENAME)
#define TILE_STR(DATA_TYPE, H, W, BASENAME) \
    union { \
        DATA_TYPE s[TILE_VECTOR_SIZE##W]; \
        TILE_VECTOR_TYPE##W(DATA_TYPE) v; \
    } BASENAME[H]

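// Example: a minimal usage sketch of TILE (the tile name "acc" is illustrative).
//
//     TILE(float, 2, 4, acc);       // expands to: union { float s[4]; float4 v; } acc[2];
//     acc[0].v    = (float4)(0.0f); // vector access to row 0
//     acc[1].s[3] = acc[0].s[0];    // scalar access to row 1, column 3
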
#define TENSOR4D_IMAGE(name) \
    __read_only image2d_t name##_img, \
    __global uchar *name##_ptr, \
    uint name##_stride_x, \
    uint name##_step_x, \
    uint name##_stride_y, \
    uint name##_step_y, \
    uint name##_stride_z, \
    uint name##_step_z, \
    uint name##_stride_w, \
    uint name##_step_w, \
    uint name##_offset_first_element_in_bytes

#define TENSOR4D_BUFFER(name) \
    __global uchar *name##_ptr, \
    uint name##_stride_x, \
    uint name##_step_x, \
    uint name##_stride_y, \
    uint name##_step_y, \
    uint name##_stride_z, \
    uint name##_step_z, \
    uint name##_stride_w, \
    uint name##_step_w, \
    uint name##_offset_first_element_in_bytes

#define TENSOR4D_STR(name, type) TENSOR4D_##type(name)
#define TENSOR4D(name, type) TENSOR4D_STR(name, type)

#define TENSOR4D_T_IMAGE(name) \
    __read_only image2d_t name##_img, \
    __global uchar *name##_ptr, \
    uint name##_stride_y, \
    uint name##_stride_z, \
    uint name##_stride_w, \
    uint name##_c, \
    uint name##_w, \
    uint name##_h, \
    uint name##_n, \
    uint name##_offset_first_element_in_bytes

#define TENSOR4D_T_BUFFER(name) \
    __global uchar *name##_ptr, \
    uint name##_stride_y, \
    uint name##_stride_z, \
    uint name##_stride_w, \
    uint name##_c, \
    uint name##_w, \
    uint name##_h, \
    uint name##_n, \
    uint name##_offset_first_element_in_bytes

#define TENSOR4D_T_STR(name, type) TENSOR4D_T_##type(name)
#define TENSOR4D_T(name, type) TENSOR4D_T_STR(name, type)

#define TENSOR3D_T_IMAGE(name) \
    __read_only image2d_t name##_img, \
    __global uchar *name##_ptr, \
    uint name##_stride_y, \
    uint name##_stride_z, \
    uint name##_w, \
    uint name##_h, \
    uint name##_n, \
    uint name##_offset_first_element_in_bytes

#define TENSOR3D_T_BUFFER(name) \
    __global uchar *name##_ptr, \
    uint name##_stride_y, \
    uint name##_stride_z, \
    uint name##_w, \
    uint name##_h, \
    uint name##_n, \
    uint name##_offset_first_element_in_bytes

#define TENSOR3D_T_STR(name, type) TENSOR3D_T_##type(name)
#define TENSOR3D_T(name, type) TENSOR3D_T_STR(name, type)

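// Example: a minimal kernel-signature sketch using the tensor argument helpers
// (the kernel name "example_kernel" and the tensor name "src" are illustrative).
//
//     __kernel void example_kernel(
//         TENSOR4D_T(src, BUFFER)) // expands to src_ptr, src_stride_y/z/w, src_c/w/h/n, src_offset_first_element_in_bytes
//     {
//         __global uchar *base = src_ptr + src_offset_first_element_in_bytes;
//     }
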
#if !defined(UNROLL_WITH_PRAGMA)
#define UNROLL_INCR(idx, step, macro) idx += (step); (macro)

#define LOOP_UNROLLING_1(idx, step, macro) (macro)
#define LOOP_UNROLLING_2(idx, step, macro) LOOP_UNROLLING_1(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_3(idx, step, macro) LOOP_UNROLLING_2(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_4(idx, step, macro) LOOP_UNROLLING_3(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_5(idx, step, macro) LOOP_UNROLLING_4(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_6(idx, step, macro) LOOP_UNROLLING_5(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_7(idx, step, macro) LOOP_UNROLLING_6(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_8(idx, step, macro) LOOP_UNROLLING_7(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_9(idx, step, macro) LOOP_UNROLLING_8(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_10(idx, step, macro) LOOP_UNROLLING_9(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_11(idx, step, macro) LOOP_UNROLLING_10(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_12(idx, step, macro) LOOP_UNROLLING_11(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_13(idx, step, macro) LOOP_UNROLLING_12(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_14(idx, step, macro) LOOP_UNROLLING_13(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_15(idx, step, macro) LOOP_UNROLLING_14(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_16(idx, step, macro) LOOP_UNROLLING_15(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_17(idx, step, macro) LOOP_UNROLLING_16(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_18(idx, step, macro) LOOP_UNROLLING_17(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_19(idx, step, macro) LOOP_UNROLLING_18(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_20(idx, step, macro) LOOP_UNROLLING_19(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_21(idx, step, macro) LOOP_UNROLLING_20(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_22(idx, step, macro) LOOP_UNROLLING_21(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_23(idx, step, macro) LOOP_UNROLLING_22(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_24(idx, step, macro) LOOP_UNROLLING_23(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_25(idx, step, macro) LOOP_UNROLLING_24(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_26(idx, step, macro) LOOP_UNROLLING_25(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_27(idx, step, macro) LOOP_UNROLLING_26(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_28(idx, step, macro) LOOP_UNROLLING_27(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_29(idx, step, macro) LOOP_UNROLLING_28(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_30(idx, step, macro) LOOP_UNROLLING_29(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_31(idx, step, macro) LOOP_UNROLLING_30(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_32(idx, step, macro) LOOP_UNROLLING_31(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_33(idx, step, macro) LOOP_UNROLLING_32(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_34(idx, step, macro) LOOP_UNROLLING_33(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_35(idx, step, macro) LOOP_UNROLLING_34(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_36(idx, step, macro) LOOP_UNROLLING_35(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_37(idx, step, macro) LOOP_UNROLLING_36(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_38(idx, step, macro) LOOP_UNROLLING_37(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_39(idx, step, macro) LOOP_UNROLLING_38(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_40(idx, step, macro) LOOP_UNROLLING_39(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_41(idx, step, macro) LOOP_UNROLLING_40(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_42(idx, step, macro) LOOP_UNROLLING_41(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_43(idx, step, macro) LOOP_UNROLLING_42(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_44(idx, step, macro) LOOP_UNROLLING_43(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_45(idx, step, macro) LOOP_UNROLLING_44(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_46(idx, step, macro) LOOP_UNROLLING_45(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_47(idx, step, macro) LOOP_UNROLLING_46(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_48(idx, step, macro) LOOP_UNROLLING_47(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_49(idx, step, macro) LOOP_UNROLLING_48(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_50(idx, step, macro) LOOP_UNROLLING_49(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_51(idx, step, macro) LOOP_UNROLLING_50(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_52(idx, step, macro) LOOP_UNROLLING_51(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_53(idx, step, macro) LOOP_UNROLLING_52(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_54(idx, step, macro) LOOP_UNROLLING_53(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_55(idx, step, macro) LOOP_UNROLLING_54(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_56(idx, step, macro) LOOP_UNROLLING_55(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_57(idx, step, macro) LOOP_UNROLLING_56(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_58(idx, step, macro) LOOP_UNROLLING_57(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_59(idx, step, macro) LOOP_UNROLLING_58(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_60(idx, step, macro) LOOP_UNROLLING_59(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_61(idx, step, macro) LOOP_UNROLLING_60(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_62(idx, step, macro) LOOP_UNROLLING_61(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_63(idx, step, macro) LOOP_UNROLLING_62(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_64(idx, step, macro) LOOP_UNROLLING_63(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_65(idx, step, macro) LOOP_UNROLLING_64(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_66(idx, step, macro) LOOP_UNROLLING_65(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_67(idx, step, macro) LOOP_UNROLLING_66(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_68(idx, step, macro) LOOP_UNROLLING_67(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_69(idx, step, macro) LOOP_UNROLLING_68(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_70(idx, step, macro) LOOP_UNROLLING_69(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_71(idx, step, macro) LOOP_UNROLLING_70(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_72(idx, step, macro) LOOP_UNROLLING_71(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_73(idx, step, macro) LOOP_UNROLLING_72(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_74(idx, step, macro) LOOP_UNROLLING_73(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_75(idx, step, macro) LOOP_UNROLLING_74(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_76(idx, step, macro) LOOP_UNROLLING_75(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_77(idx, step, macro) LOOP_UNROLLING_76(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_78(idx, step, macro) LOOP_UNROLLING_77(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_79(idx, step, macro) LOOP_UNROLLING_78(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_80(idx, step, macro) LOOP_UNROLLING_79(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_81(idx, step, macro) LOOP_UNROLLING_80(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_82(idx, step, macro) LOOP_UNROLLING_81(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_83(idx, step, macro) LOOP_UNROLLING_82(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_84(idx, step, macro) LOOP_UNROLLING_83(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_85(idx, step, macro) LOOP_UNROLLING_84(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_86(idx, step, macro) LOOP_UNROLLING_85(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_87(idx, step, macro) LOOP_UNROLLING_86(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_88(idx, step, macro) LOOP_UNROLLING_87(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_89(idx, step, macro) LOOP_UNROLLING_88(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_90(idx, step, macro) LOOP_UNROLLING_89(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_91(idx, step, macro) LOOP_UNROLLING_90(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_92(idx, step, macro) LOOP_UNROLLING_91(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_93(idx, step, macro) LOOP_UNROLLING_92(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_94(idx, step, macro) LOOP_UNROLLING_93(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_95(idx, step, macro) LOOP_UNROLLING_94(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_96(idx, step, macro) LOOP_UNROLLING_95(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_97(idx, step, macro) LOOP_UNROLLING_96(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_98(idx, step, macro) LOOP_UNROLLING_97(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_99(idx, step, macro) LOOP_UNROLLING_98(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_100(idx, step, macro) LOOP_UNROLLING_99(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_101(idx, step, macro) LOOP_UNROLLING_100(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_102(idx, step, macro) LOOP_UNROLLING_101(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_103(idx, step, macro) LOOP_UNROLLING_102(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_104(idx, step, macro) LOOP_UNROLLING_103(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_105(idx, step, macro) LOOP_UNROLLING_104(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_106(idx, step, macro) LOOP_UNROLLING_105(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_107(idx, step, macro) LOOP_UNROLLING_106(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_108(idx, step, macro) LOOP_UNROLLING_107(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_109(idx, step, macro) LOOP_UNROLLING_108(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_110(idx, step, macro) LOOP_UNROLLING_109(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_111(idx, step, macro) LOOP_UNROLLING_110(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_112(idx, step, macro) LOOP_UNROLLING_111(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_113(idx, step, macro) LOOP_UNROLLING_112(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_114(idx, step, macro) LOOP_UNROLLING_113(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_115(idx, step, macro) LOOP_UNROLLING_114(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_116(idx, step, macro) LOOP_UNROLLING_115(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_117(idx, step, macro) LOOP_UNROLLING_116(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_118(idx, step, macro) LOOP_UNROLLING_117(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_119(idx, step, macro) LOOP_UNROLLING_118(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_120(idx, step, macro) LOOP_UNROLLING_119(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_121(idx, step, macro) LOOP_UNROLLING_120(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_122(idx, step, macro) LOOP_UNROLLING_121(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_123(idx, step, macro) LOOP_UNROLLING_122(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_124(idx, step, macro) LOOP_UNROLLING_123(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_125(idx, step, macro) LOOP_UNROLLING_124(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_126(idx, step, macro) LOOP_UNROLLING_125(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_127(idx, step, macro) LOOP_UNROLLING_126(idx, step, macro); UNROLL_INCR(idx, step, macro)
#define LOOP_UNROLLING_128(idx, step, macro) LOOP_UNROLLING_127(idx, step, macro); UNROLL_INCR(idx, step, macro)

#define LOOP_UNROLLING_STR(type, idx, start, step, num, macro) \
    { \
        type idx = start; \
        LOOP_UNROLLING_##num(idx, step, macro); \
    }
#else // !defined(UNROLL_WITH_PRAGMA)
#define LOOP_UNROLLING_STR(type, idx, start, step, num, macro) \
    { \
        _Pragma("unroll") \
        for(type idx = start; idx < (num * step); idx += step) \
        { \
            (macro); \
        } \
    }
#endif // !defined(UNROLL_WITH_PRAGMA)
#define LOOP_UNROLLING(type, idx, start, step, num, macro) LOOP_UNROLLING_STR(type, idx, start, step, num, macro)

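// Example: without UNROLL_WITH_PRAGMA defined, LOOP_UNROLLING(int, i, 0, 1, 3, body)
// expands to a fully unrolled compound statement equivalent to:
//
//     { int i = 0; (body); i += 1; (body); i += 1; (body); }
//
// With UNROLL_WITH_PRAGMA defined, it emits a _Pragma("unroll") for loop instead.
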
/** Get the get_global_id with partial N0. This function is useful when the dimension is not a multiple of N0
 * and we need to use a partial N0 to avoid out-of-bounds reads/writes
 *
 * @note PARTIAL_N0 is used for get_global_id(IDX) = 0.
 *
 * @param[in] IDX        get_global_id index (0, 1 and 2 only)
 * @param[in] N0         Number of elements read/written in the IDX direction
 * @param[in] PARTIAL_N0 Number of elements read/written in the IDX direction for get_global_id(IDX) = 0. If zero,
 *                       the number of elements read/written in the IDX direction for get_global_id(IDX) = 0 is N0
 */
#define GET_SPATIAL_IDX(IDX, N0, PARTIAL_N0) (max((int)(get_global_id(IDX) * N0 - (N0 - PARTIAL_N0) % N0), 0))

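// Example: with N0 = 4 and PARTIAL_N0 = 2, (N0 - PARTIAL_N0) % N0 = 2, so:
//     get_global_id(IDX) = 0 -> max(0 * 4 - 2, 0) = 0 (partial block: elements 0..1)
//     get_global_id(IDX) = 1 -> max(1 * 4 - 2, 0) = 2 (full block: elements 2..5)
//     get_global_id(IDX) = 2 -> max(2 * 4 - 2, 0) = 6 (full block: elements 6..9)
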
/** Dot product integer 8-bit function
 *
 * @note Performs: c += dot(a, b)
 *
 * @param[in]      A_DATA_TYPE A (lhs) data type
 * @param[in]      B_DATA_TYPE B (rhs) data type
 * @param[in]      C_DATA_TYPE C (accumulator) data type
 * @param[in]      K0          Number of accumulations
 * @param[in]      a           OpenCL vector a
 * @param[in]      b           OpenCL vector b
 * @param[in, out] c           Scalar accumulator c
 */
#define DOT_PRODUCT_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c) DOT_PRODUCT_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c)
#define DOT_PRODUCT_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, b, c) DOT_PRODUCT##K0##_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c)
#define DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        c += (C_DATA_TYPE)(a) * (C_DATA_TYPE)(b); \
    })
#if defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_khr_integer_dot_product)
#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += dot((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0)));
#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += dot((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0));
#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += dot((a), (b));
#elif defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_khr_integer_dot_product)
#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0)), (c));
#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0), (c));
#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c = arm_dot_acc((a), (b), (c));
#elif defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8) // defined(ARM_COMPUTE_OPENCL_DOT8_ENABLED) && defined(cl_arm_integer_dot_product_int8)
#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((A_DATA_TYPE##4)((a).s01, (A_DATA_TYPE##2)(0)), (B_DATA_TYPE##4)(((b).s01), (B_DATA_TYPE##2)(0)));
#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((A_DATA_TYPE##4)((a).s012, (A_DATA_TYPE)0), (B_DATA_TYPE##4)(((b).s012), (B_DATA_TYPE)0));
#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) c += arm_dot((a), (b));
#else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
#define DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        c += (C_DATA_TYPE)(a).s0 * (C_DATA_TYPE)(b).s0; \
        c += (C_DATA_TYPE)(a).s1 * (C_DATA_TYPE)(b).s1; \
    })
#define DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c); \
        c += (C_DATA_TYPE)(a).s2 * (C_DATA_TYPE)(b).s2; \
    })
#define DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, x, y, val) \
    ({ \
        val += (C_DATA_TYPE)(x).s0 * (C_DATA_TYPE)(y).s0; \
        val += (C_DATA_TYPE)(x).s1 * (C_DATA_TYPE)(y).s1; \
        val += (C_DATA_TYPE)(x).s2 * (C_DATA_TYPE)(y).s2; \
        val += (C_DATA_TYPE)(x).s3 * (C_DATA_TYPE)(y).s3; \
    })
#endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8)
#define DOT_PRODUCT5_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c); \
        DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s4), ((b).s4), c); \
    })
#define DOT_PRODUCT6_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c); \
        DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s45), ((b).s45), c); \
    })
#define DOT_PRODUCT7_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s0123), ((b).s0123), c); \
        DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s456), ((b).s456), c); \
    })
#define DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).lo), ((b).lo), c); \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).hi), ((b).hi), c); \
    })
#define DOT_PRODUCT9_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
        DOT_PRODUCT1_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s8), ((b).s8), c); \
    })
#define DOT_PRODUCT10_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
        DOT_PRODUCT2_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89), ((b).s89), c); \
    })
#define DOT_PRODUCT11_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
        DOT_PRODUCT3_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89A), ((b).s89A), c); \
    })
#define DOT_PRODUCT12_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
        DOT_PRODUCT4_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89AB), ((b).s89AB), c); \
    })
#define DOT_PRODUCT13_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
        DOT_PRODUCT5_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89ABC), ((b).s89ABC), c); \
    })
#define DOT_PRODUCT14_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
        DOT_PRODUCT6_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89ABCD), ((b).s89ABCD), c); \
    })
#define DOT_PRODUCT15_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s01234567), ((b).s01234567), c); \
        DOT_PRODUCT7_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).s89ABCDE), ((b).s89ABCDE), c); \
    })
#define DOT_PRODUCT16_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, a, b, c) \
    ({ \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).lo), ((b).lo), c); \
        DOT_PRODUCT8_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, ((a).hi), ((b).hi), c); \
    })

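// Example: a minimal sketch accumulating a 4-way int8 dot product into an int
// (variable names are illustrative).
//
//     char4 a = (char4)(1, 2, 3, 4);
//     char4 b = (char4)(5, 6, 7, 8);
//     int c = 0;
//     DOT_PRODUCT_INTEGER8(char, char, int, 4, a, b, c); // c += 1*5 + 2*6 + 3*7 + 4*8 = 70
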
/** Reduction (sum) of an integer 8-bit vector
 *
 * @note Performs: c += reduce(a), implemented as a dot product between a and a vector of ones
 *
 * @param[in]      A_DATA_TYPE A (lhs) data type
 * @param[in]      B_DATA_TYPE B (rhs) data type, used for the internal vector of ones
 * @param[in]      C_DATA_TYPE C (accumulator) data type
 * @param[in]      K0          Number of accumulations
 * @param[in]      a           OpenCL vector a
 * @param[in, out] c           Scalar accumulator c
 */
#define REDUCE_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c) REDUCE_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c)
#define REDUCE_INTEGER8_STR(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, c) DOT_PRODUCT_INTEGER8(A_DATA_TYPE, B_DATA_TYPE, C_DATA_TYPE, K0, a, (TILE_VECTOR_TYPE##K0(B_DATA_TYPE))1, c)

/** Load a vector from global memory (tensor)
 *
 * @param[in] DATA_TYPE   Data type
 * @param[in] WIDTH       Number of dst columns
 * @param[in] TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image).
 *                        In case of cl_image, only WIDTH values that are multiples of 4 are supported (4, 8, 16)
 * @param[in] TENSOR      Tensor basename
 * @param[in] X           Starting X position
 * @param[in] Y           Starting Y position
 * @param[in] STRIDE_Y    Stride Y (in bytes)
 */
#define V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y) V_LOAD_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y)
#define V_LOAD_STR(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, Y, STRIDE_Y) V_LOAD_##TENSOR_TYPE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y)
#define V_LOAD_BUFFER(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y) \
    VLOAD(WIDTH) \
    (0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (Y) * (STRIDE_Y)))
#define V_LOAD_IMAGE(DATA_TYPE, WIDTH, TENSOR, X, Y, STRIDE_Y) READ_IMAGE2D(DATA_TYPE, CONVERT_VECTOR_SIZE_TO_PIXEL_UNIT(WIDTH), TENSOR##_img, (X) / 4, (Y))

/** Load a tile from global memory (tensor)
 *
 * @param[in]  DATA_TYPE     Data type
 * @param[in]  HEIGHT        Number of dst rows
 * @param[in]  WIDTH         Number of dst columns
 * @param[in]  TENSOR_TYPE   Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image).
 *                           In case of cl_image, only WIDTH values that are multiples of 4 are supported (4, 8, 16)
 * @param[in]  TENSOR        Tensor basename
 * @param[in]  X             Starting X position
 * @param[in]  Y             Starting Y position
 * @param[in]  YI_MULTIPLIER Parameter used to multiply the internal row increment (_i).
 *                           In common cases it should be 1, but it becomes useful when we want to load rows which are
 *                           multiples of STRIDE_Y (e.g. loading the weights of a convolution layer).
 *                           In this case the address calculation is performed as: (Y + _i * YI_MULTIPLIER) * STRIDE_Y
 * @param[in]  STRIDE_Y      Stride Y (in bytes) used to load each row.
 * @param[out] dst           Output tile
 */
#define T_LOAD(DATA_TYPE, HEIGHT, WIDTH, TENSOR_TYPE, TENSOR, X, Y, YI_MULTIPLIER, STRIDE_Y, dst) \
    ({ \
        LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \
        { \
            dst[_i].v = V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, ((Y) + _i * (int)(YI_MULTIPLIER)), STRIDE_Y); \
        }) \
    })

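// Example: a minimal sketch loading a 4x8 float tile from a buffer tensor "src"
// (the tile name "in" and the coordinates x and y are illustrative).
//
//     TILE(float, 4, 8, in);
//     T_LOAD(float, 4, 8, BUFFER, src, x, y, 1, src_stride_y, in);
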
/** Load a tile from global memory (tensor) using an indirect Y index tile
 *
 * @param[in]  DATA_TYPE   Data type
 * @param[in]  HEIGHT      Number of dst rows
 * @param[in]  WIDTH       Number of dst columns
 * @param[in]  TENSOR_TYPE Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image). Currently only BUFFER is supported.
 *                         In case of cl_image, only WIDTH values that are multiples of 4 are supported (4, 8, 16)
 * @param[in]  TENSOR      Tensor basename
 * @param[in]  X           Starting X position
 * @param[in]  STRIDE_Y    Stride Y (in bytes)
 * @param[in]  indirect_y  Indirect Y index tile
 * @param[out] dst         Output tile
 */
#define T_LOAD_INDIRECT(DATA_TYPE, HEIGHT, WIDTH, TENSOR_TYPE, TENSOR, X, STRIDE_Y, indirect_y, dst) \
    ({ \
        LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \
        { \
            dst[_i].v = V_LOAD(DATA_TYPE, WIDTH, TENSOR_TYPE, TENSOR, X, (indirect_y[_i].v), STRIDE_Y); \
        }) \
    })

/** Load a tile from global memory (tensor) using an indirect Y index tile and conditionally use a different length for the load
 *
 * @note If WIDTH1_CONDITION is true, the load will use the WIDTH1 length
 * @note The vectors are loaded in reverse order so the invalid rows are overwritten by the valid ones
 *
 * @param[in]  DATA_TYPE        Data type
 * @param[in]  HEIGHT           Number of dst rows
 * @param[in]  WIDTH0           Load width to use if WIDTH1_CONDITION = false
 * @param[in]  WIDTH1           Load width to use if WIDTH1_CONDITION = true
 * @param[in]  TENSOR_TYPE      Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image).
 *                              In case of cl_image, only WIDTH values that are multiples of 4 are supported (4, 8, 16)
 * @param[in]  TENSOR           Tensor basename
 * @param[in]  X                Starting X position
 * @param[in]  STRIDE_Y         Stride Y (in bytes) used to load each row.
 * @param[in]  WIDTH1_CONDITION Condition to select the WIDTH1 load
 * @param[out] dst              Output tile
 * @param[in]  indirect_y       Indirect Y index tile
 */
#define T_LOAD_INDIRECT_WIDTH_SELECT(DATA_TYPE, HEIGHT, WIDTH0, WIDTH1, TENSOR_TYPE, TENSOR, X, STRIDE_Y, WIDTH1_CONDITION, dst, indirect_y) \
    ({ \
        if(WIDTH1_CONDITION) \
        { \
            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \
            { \
                VLOAD_PARTIAL(WIDTH0, WIDTH1) \
                (dst[HEIGHT - 1 - _i].v, 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[HEIGHT - 1 - _i].v) * STRIDE_Y)); \
            }) \
        } \
        else \
        { \
            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \
            { \
                dst[HEIGHT - 1 - _i].v = V_LOAD(DATA_TYPE, WIDTH0, TENSOR_TYPE, TENSOR, X, (indirect_y[HEIGHT - 1 - _i].v), STRIDE_Y); \
            }) \
        } \
    })

/** Load a tile from global memory (tensor) when the tensor is stored using a NHWC layout
 *
 * @param[in]  DATA_TYPE     Data type
 * @param[in]  TILE_HEIGHT   Number of elements to load from Y (height) dimension
 * @param[in]  TILE_WIDTH    Number of elements to load from X (width) dimension
 * @param[in]  TILE_CHANNELS Number of elements to load from C (channel) dimension
 * @param[in]  TENSOR_TYPE   Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image). Currently only BUFFER is supported.
 *                           In case of cl_image, only TILE_CHANNELS values that are multiples of 4 are supported (4, 8, 16)
 * @param[in]  TENSOR        Tensor basename
 * @param[in]  B             Starting batch index
 * @param[in]  Y             Starting Y index
 * @param[in]  X             Starting X index
 * @param[in]  C             Starting C index
 * @param[in]  TENSOR_WIDTH  Number of elements in the X (width) dimension of the tensor
 * @param[in]  TENSOR_HEIGHT Number of elements in the Y (height) dimension of the tensor
 * @param[in]  STRIDE_Y      Stride Y (in bytes)
 * @param[out] dst           Output tile
 */
#define T_LOAD_NHWC(DATA_TYPE, TILE_HEIGHT, TILE_WIDTH, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, STRIDE_Y, dst) \
    ({ \
        LOOP_UNROLLING(int, _yk, 0, 1, TILE_HEIGHT, \
        { \
            LOOP_UNROLLING(int, _xk, 0, 1, TILE_WIDTH, \
            { \
                int _src_y = (X) + _xk + ((Y) + _yk) * (TENSOR_WIDTH); \
                _src_y += (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT); \
                int _src_valid_y = (((X) + _xk) >= 0 && ((X) + _xk) < (int)(TENSOR_WIDTH) && ((Y) + _yk) >= 0 && ((Y) + _yk) < (int)(TENSOR_HEIGHT)); \
                if(_src_valid_y != 0) \
                { \
                    dst[_xk + _yk * (TILE_WIDTH)].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y); \
                } \
            }) \
        }) \
    })

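// In T_LOAD_NHWC the (B, Y, X) coordinates are collapsed into a single row index of a
// (batches * TENSOR_HEIGHT * TENSOR_WIDTH) x channels view of the tensor:
//     _src_y = (X + _xk) + (Y + _yk) * TENSOR_WIDTH + B * TENSOR_WIDTH * TENSOR_HEIGHT
// so each row holds the channels of one spatial position and STRIDE_Y steps between
// consecutive positions. For example, with TENSOR_WIDTH = 5 and TENSOR_HEIGHT = 3, the
// position (B, Y, X) = (1, 2, 4) maps to row 4 + 2 * 5 + 1 * 5 * 3 = 29.
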
/** Load a tile from global memory (tensor) when the tensor is stored using a NHWC layout with dilation for the X and Y increments
 *
 * @param[in]  DATA_TYPE      Data type
 * @param[in]  TILE_HEIGHT    Number of elements to load from Y (height) dimension
 * @param[in]  TILE_WIDTH     Number of elements to load from X (width) dimension
 * @param[in]  TILE_CHANNELS  Number of elements to load from C (channel) dimension
 * @param[in]  TENSOR_TYPE    Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image). Currently only BUFFER is supported.
 *                            In case of cl_image, only TILE_CHANNELS values that are multiples of 4 are supported (4, 8, 16)
 * @param[in]  TENSOR         Tensor basename
 * @param[in]  B              Starting batch index
 * @param[in]  Y              Starting Y index
 * @param[in]  X              Starting X index
 * @param[in]  C              Starting C index
 * @param[in]  TENSOR_WIDTH   Number of elements in the X (width) dimension of the tensor
 * @param[in]  TENSOR_HEIGHT  Number of elements in the Y (height) dimension of the tensor
 * @param[in]  DILATION_X     Dilation for the X increment
 * @param[in]  DILATION_Y     Dilation for the Y increment
 * @param[in]  BOUNDARY_CHECK Boundary check flag. If true, it checks for any out-of-bounds reads
 * @param[out] dst            Output tile
 */
#define T_LOAD_NHWC_WITH_DILATION(DATA_TYPE, TILE_HEIGHT, TILE_WIDTH, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, DILATION_X, DILATION_Y, BOUNDARY_CHECK, dst) \
    ({ \
        LOOP_UNROLLING(int, _yk, 0, 1, TILE_HEIGHT, \
        { \
            LOOP_UNROLLING(int, _xk, 0, 1, TILE_WIDTH, \
            { \
                int _src_y = (X) + _xk * (DILATION_X); \
                int _src_z = ((Y) + _yk * (DILATION_Y)); \
                int _src_w = (B); \
                bool _src_valid_y = (((X) + _xk * (DILATION_X)) >= 0) && (((X) + _xk * (DILATION_X)) < (int)(TENSOR_WIDTH)) && (((Y) + _yk * (DILATION_Y)) >= 0) && (((Y) + _yk * (DILATION_Y)) < (int)(TENSOR_HEIGHT)); \
                if(!(BOUNDARY_CHECK)) \
                { \
                    dst[_xk + _yk * (TILE_WIDTH)].v = VLOAD(TILE_CHANNELS) \
                    (0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (C) * sizeof(DATA_TYPE) + (_src_y) * (TENSOR##_stride_y) + (_src_z) * (TENSOR##_stride_z) + (_src_w) * (TENSOR##_stride_w))); \
                } \
                else \
                { \
                    if(_src_valid_y) \
                    { \
                        dst[_xk + _yk * (TILE_WIDTH)].v = VLOAD(TILE_CHANNELS) \
                        (0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (C) * sizeof(DATA_TYPE) + (_src_y) * (TENSOR##_stride_y) + (_src_z) * (TENSOR##_stride_z) + (_src_w) * (TENSOR##_stride_w))); \
                    } \
                } \
            }) \
        }) \
    })

/** Load a tile from global memory (tensor) when the tensor is stored using a NHWC layout using indirect X and Y coordinates
 *
 * @param[in]  DATA_TYPE     Data type
 * @param[in]  TILE_AREA     Number of elements to load from Y (height) dimension * number of elements to load from X (width) dimension
 * @param[in]  TILE_CHANNELS Number of elements to load from C (channel) dimension
 * @param[in]  TENSOR_TYPE   Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image). Currently only BUFFER is supported.
 *                           In case of cl_image, only TILE_CHANNELS values that are multiples of 4 are supported (4, 8, 16)
 * @param[in]  TENSOR        Tensor basename
 * @param[in]  B             Starting batch index
 * @param[in]  Y             Starting Y index
 * @param[in]  X             Starting X index
 * @param[in]  C             Starting C index
 * @param[in]  TENSOR_WIDTH  Number of elements in the X (width) dimension of the tensor
 * @param[in]  TENSOR_HEIGHT Number of elements in the Y (height) dimension of the tensor
 * @param[in]  STRIDE_Y      Stride Y (in bytes)
 * @param[in]  xi            A tile with (TILE_WIDTH x TILE_HEIGHT) values with the indirect X coordinate
 * @param[in]  yi            A tile with (TILE_WIDTH x TILE_HEIGHT) values with the indirect Y coordinate
 * @param[out] dst           Output tile
 */
#define T_LOAD_NHWC_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, STRIDE_Y, xi, yi, dst) \
    ({ \
        LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \
        { \
            int _src_y = (X) + xi[_i].v + ((Y) + yi[_i].v) * (TENSOR_WIDTH); \
            _src_y += (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT); \
            int _src_valid_y = (((X) + xi[_i].v) >= 0 && ((X) + xi[_i].v) < (int)(TENSOR_WIDTH) && ((Y) + yi[_i].v) >= 0 && ((Y) + yi[_i].v) < (int)(TENSOR_HEIGHT)); \
            if(_src_valid_y != 0) \
            { \
                dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y); \
            } \
        }) \
    })

/** Load a tile from global memory (tensor) using an indirect Y index tile, skipping
 *  the rows whose index is negative. Parameters follow the same conventions as
 *  T_LOAD_NHWC_INDIRECT above; rows with yi[_i].v < 0 are left unmodified in dst.
 */
#define T_LOAD2D_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, STRIDE_Y, yi, dst) \
    ({ \
        LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \
        { \
            if(yi[_i].v >= 0) \
            { \
                dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, yi[_i].v, STRIDE_Y); \
            } \
        }) \
    })

/** Load a tile from global memory (tensor) when the tensor is stored using a NDHWC layout using indirect X, Y and Z coordinates
 *
 * @param[in]  DATA_TYPE     Data type
 * @param[in]  TILE_AREA     Number of elements to load from Y (height) dimension * number of elements to load from X (width) dimension
 * @param[in]  TILE_CHANNELS Number of elements to load from C (channel) dimension
 * @param[in]  TENSOR_TYPE   Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image). Currently only BUFFER is supported.
 *                           In case of cl_image, only TILE_CHANNELS values that are multiples of 4 are supported (4, 8, 16)
 * @param[in]  TENSOR        Tensor basename
 * @param[in]  B             Starting batch index
 * @param[in]  Z             Starting Z index
 * @param[in]  Y             Starting Y index
 * @param[in]  X             Starting X index
 * @param[in]  C             Starting C index
 * @param[in]  TENSOR_WIDTH  Number of elements in the X (width) dimension of the tensor
 * @param[in]  TENSOR_HEIGHT Number of elements in the Y (height) dimension of the tensor
 * @param[in]  TENSOR_DEPTH  Number of elements in the Z (depth) dimension of the tensor
 * @param[in]  STRIDE_Y      Stride Y (in bytes)
 * @param[in]  xi            A tile with (TILE_WIDTH x TILE_HEIGHT) values with the indirect X coordinate
 * @param[in]  yi            A tile with (TILE_WIDTH x TILE_HEIGHT) values with the indirect Y coordinate
 * @param[in]  zi            A tile with (TILE_WIDTH x TILE_HEIGHT) values with the indirect Z coordinate
 * @param[out] dst           Output tile
 */
#define T_LOAD_NDHWC_INDIRECT(DATA_TYPE, TILE_AREA, TILE_CHANNELS, TENSOR_TYPE, TENSOR, B, Z, Y, X, C, TENSOR_WIDTH, TENSOR_HEIGHT, TENSOR_DEPTH, STRIDE_Y, xi, yi, zi, dst) \
    ({ \
        LOOP_UNROLLING(int, _i, 0, 1, TILE_AREA, \
        { \
            int _src_y = (X) + xi[_i].v + ((Y) + yi[_i].v) * (TENSOR_WIDTH) + ((Z) + zi[_i].v) * (TENSOR_WIDTH * TENSOR_HEIGHT); \
            _src_y += (B) * (int)(TENSOR_WIDTH) * (int)(TENSOR_HEIGHT) * (int)(TENSOR_DEPTH); \
            int _src_valid_y = (((X) + xi[_i].v) >= 0 && ((X) + xi[_i].v) < (int)(TENSOR_WIDTH) && ((Y) + yi[_i].v) >= 0 && ((Y) + yi[_i].v) < (int)(TENSOR_HEIGHT) \
                               && ((Z) + zi[_i].v) >= 0 && ((Z) + zi[_i].v) < (int)(TENSOR_DEPTH)); \
            if(_src_valid_y != 0) \
            { \
                dst[_i].v = V_LOAD(DATA_TYPE, TILE_CHANNELS, TENSOR_TYPE, TENSOR, C, _src_y, STRIDE_Y); \
            } \
        }) \
    })

/** Store a tile to global memory (tensor) using an indirect Y index tile and conditionally use a different length for the store
 *
 * @note If WIDTH1_CONDITION is true, the store will use the WIDTH1 length
 * @note The vectors are stored in reverse order so the invalid rows are overwritten by the valid ones
 *
 * @param[in] DATA_TYPE        Data type
 * @param[in] HEIGHT           Number of src rows
 * @param[in] WIDTH0           Store width to use if WIDTH1_CONDITION = false
 * @param[in] WIDTH1           Store width to use if WIDTH1_CONDITION = true
 * @param[in] TENSOR_TYPE      Type of cl_type used to store the tensor in global memory (BUFFER=cl_buffer, IMAGE=cl_image).
 *                             Currently only BUFFER is supported; cl_image is not supported.
 * @param[in] TENSOR           Tensor basename
 * @param[in] X                Starting X position
 * @param[in] STRIDE_Y         Stride Y (in bytes)
 * @param[in] WIDTH1_CONDITION Condition to select the WIDTH1 store
 * @param[in] src              Input tile
 * @param[in] indirect_y       Indirect Y index tile
 */
#define T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, HEIGHT, WIDTH0, WIDTH1, TENSOR_TYPE, TENSOR, X, STRIDE_Y, WIDTH1_CONDITION, src, indirect_y) \
    ({ \
        if(WIDTH1_CONDITION) \
        { \
            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \
            { \
                VSTORE_PARTIAL(WIDTH0, WIDTH1) \
                (CONVERT(src[HEIGHT - 1 - _i].v, VEC_DATA_TYPE(DATA_TYPE, WIDTH0)), 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[HEIGHT - 1 - _i].v) * STRIDE_Y)); \
            }) \
        } \
        else \
        { \
            LOOP_UNROLLING(int, _i, 0, 1, HEIGHT, \
            { \
                VSTORE(WIDTH0) \
                (CONVERT(src[HEIGHT - 1 - _i].v, VEC_DATA_TYPE(DATA_TYPE, WIDTH0)), 0, (__global DATA_TYPE *)(TENSOR##_ptr + TENSOR##_offset_first_element_in_bytes + (X) * sizeof(DATA_TYPE) + (indirect_y[HEIGHT - 1 - _i].v) * STRIDE_Y)); \
            }) \
        } \
    })

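// Example: a minimal sketch storing an M0-row accumulator tile "c" to a buffer
// tensor "dst" with per-row indirect Y indices (all names, and the x/cond
// variables, are illustrative; PARTIAL_N0 is the leftover width to store when
// the output width is not a multiple of N0).
//
//     TILE(int, M0, 1, indirect_y); // one destination row index per tile row
//     T_STORE_INDIRECT_WIDTH_SELECT(uchar, M0, N0, PARTIAL_N0, BUFFER, dst, x, dst_stride_y, cond, c, indirect_y);
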
/** Offset correction for the QASYMM8 computation
 *
 * @param[in]      ACC_DATA_TYPE Accumulator data type
 * @param[in]      M0            Number of src/dst rows
 * @param[in]      N0            Number of src/dst columns
 * @param[in]      K0            Number of src columns
 * @param[in]      SRC_OFFSET    Source quantization offset
 * @param[in]      WEI_OFFSET    Weights quantization offset
 * @param[in]      lhs           LHS tile
 * @param[in]      rhs           RHS tile
 * @param[in, out] dst           DST tile
 */
#define T_OFFSET_CORRECTION(ACC_DATA_TYPE, M0, N0, K0, SRC_OFFSET, WEI_OFFSET, lhs, rhs, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            ACC_DATA_TYPE _tm = 0; \
            LOOP_UNROLLING(int, _k0, 0, 1, K0, \
            { \
                _tm += ((ACC_DATA_TYPE)lhs[_m0].s[_k0] * (ACC_DATA_TYPE)WEI_OFFSET); \
            }) \
            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
            { \
                dst[_m0].s[_n0] += _tm; \
                LOOP_UNROLLING(int, _k0, 0, 1, K0, \
                { \
                    dst[_m0].s[_n0] += ((ACC_DATA_TYPE)rhs[_n0].s[_k0] * (ACC_DATA_TYPE)SRC_OFFSET); \
                }) \
            }) \
        }) \
    })

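// T_OFFSET_CORRECTION accumulates, for every output element,
//     dst[m].s[n] += WEI_OFFSET * sum_k(lhs[m].s[k]) + SRC_OFFSET * sum_k(rhs[n].s[k])
// i.e. the quantization-offset cross terms of the integer matrix multiplication
// (the offsets are expected to be passed with the sign convention of the caller).
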
/** 8-bit quantization with fixed-point scale
 *
 * @param[in]  SRC_DATA_TYPE     SRC data type
 * @param[in]  DST_DATA_TYPE     DST data type
 * @param[in]  QUANTIZATION_TYPE Quantization type (PER_TENSOR or PER_CHANNEL)
 * @param[in]  M0                Number of src/dst rows
 * @param[in]  N0                Number of src/dst columns
 * @param[in]  DST_OFFSET        Quantization offset used for both the per-tensor and per-channel quantization
 * @param[in]  DST_SHIFT         Quantization shift for the per-tensor quantization
 * @param[in]  DST_MULTIPLIER    Quantization multiplier for the per-tensor quantization
 * @param[in]  src               Input tile
 * @param[in]  dst_multipliers   Output multipliers tile for the per-channel quantization
 * @param[in]  dst_shifts        Output shift tile for the per-channel quantization
 * @param[out] dst               Output tile
 */
#define T_QUANTIZE8(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) T_QUANTIZE8_STR(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst)
#define T_QUANTIZE8_STR(SRC_DATA_TYPE, DST_DATA_TYPE, QUANTIZATION_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) T_QUANTIZE8_##QUANTIZATION_TYPE(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst)

/** 8-bit per-tensor quantization with fixed-point scale
 *
 * @param[in]  SRC_DATA_TYPE   SRC data type
 * @param[in]  DST_DATA_TYPE   DST data type
 * @param[in]  M0              Number of src/dst rows
 * @param[in]  N0              Number of src/dst columns
 * @param[in]  DST_OFFSET      Quantization offset
 * @param[in]  DST_SHIFT       Quantization shift for the per-tensor quantization
 * @param[in]  DST_MULTIPLIER  Quantization multiplier for the per-tensor quantization
 * @param[in]  src             Input tile
 * @param[in]  dst_multipliers (unused)
 * @param[in]  dst_shifts      (unused)
 * @param[out] dst             Output tile
 */
#define T_QUANTIZE8_PER_TENSOR(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
            { \
                SRC_DATA_TYPE _tmp = 0; \
                SRC_DATA_TYPE _src = src[_m0].s[_n0]; \
                _src *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-DST_SHIFT)), ((SRC_DATA_TYPE)DST_SHIFT < (SRC_DATA_TYPE)0)); \
                SRC_DATA_TYPE overflow = _src == DST_MULTIPLIER && _src == INT_MIN; \
                long a_64 = (long)(_src); \
                long b_64 = (long)(DST_MULTIPLIER); \
                long ab_64 = a_64 * b_64; \
                long mask1 = 1 << 30; \
                long mask2 = 1 - (1 << 30); \
                long is_positive_or_zero = ab_64 >= 0; \
                long nudge = select(mask2, mask1, is_positive_or_zero); \
                SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \
                _tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow); \
                if(DST_SHIFT >= 0) \
                { \
                    long mask = ((((int)1) << DST_SHIFT) - (long)1); \
                    long threshold = _tmp < (int)0 ? (mask >> 1) + (long)1 : (mask >> 1) + 0; \
                    _tmp = (_tmp & mask) > threshold ? (_tmp >> DST_SHIFT) + (int)1 : (_tmp >> DST_SHIFT); \
                } \
                _tmp += DST_OFFSET; \
                dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE); \
            }) \
        }) \
    })

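// The fixed-point rescaling above follows the gemmlowp scheme: the value is first
// multiplied by DST_MULTIPLIER treated as a Q0.31 fixed-point number using a
// rounding-doubling high multiply, i.e. round((long)src * DST_MULTIPLIER / 2^31),
// saturating the single overflow case src == DST_MULTIPLIER == INT_MIN; a rounded
// right shift by DST_SHIFT is then applied (negative shifts are applied up front
// as a left shift); finally DST_OFFSET is added and the result is saturated to
// the destination type.
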
/** 8-bit per-channel quantization with fixed-point scale
 *
 * @param[in]  SRC_DATA_TYPE   SRC data type
 * @param[in]  DST_DATA_TYPE   DST data type
 * @param[in]  M0              Number of src/dst rows
 * @param[in]  N0              Number of src/dst columns
 * @param[in]  DST_OFFSET      Quantization offset
 * @param[in]  DST_SHIFT       (unused)
 * @param[in]  DST_MULTIPLIER  (unused)
 * @param[in]  src             Input tile
 * @param[in]  dst_multipliers Output multipliers tile for the per-channel quantization
 * @param[in]  dst_shifts      Output shift tile for the per-channel quantization
 * @param[out] dst             Output tile
 */
#define T_QUANTIZE8_PER_CHANNEL(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst_multipliers, dst_shifts, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
            { \
                SRC_DATA_TYPE _tmp = 0; \
                SRC_DATA_TYPE _src = src[_m0].s[_n0]; \
                SRC_DATA_TYPE _dst_multiplier = dst_multipliers[0].s[_n0]; \
                SRC_DATA_TYPE _dst_shift = dst_shifts[0].s[_n0]; \
                _src *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-_dst_shift)), ((SRC_DATA_TYPE)_dst_shift < (SRC_DATA_TYPE)0)); \
                SRC_DATA_TYPE overflow = _src == _dst_multiplier && _src == INT_MIN; \
                long a_64 = (long)(_src); \
                long b_64 = (long)(_dst_multiplier); \
                long ab_64 = a_64 * b_64; \
                long mask1 = 1 << 30; \
                long mask2 = 1 - (1 << 30); \
                long is_positive_or_zero = ab_64 >= 0; \
                long nudge = select(mask2, mask1, is_positive_or_zero); \
                SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \
                _tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow); \
                if(_dst_shift >= 0) \
                { \
                    long mask = ((((int)1) << _dst_shift) - (int)1); \
                    long threshold = _tmp < (int)0 ? (mask >> 1) + (long)1 : (mask >> 1) + 0; \
                    _tmp = (_tmp & mask) > threshold ? (_tmp >> _dst_shift) + (int)1 : (_tmp >> _dst_shift); \
                } \
                _tmp += DST_OFFSET; \
                dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE); \
            }) \
        }) \
    })

/** Quantize an 8-bit tile with fixed-point scale for asymmetric quantization
 *
 * @param[in]  SRC_DATA_TYPE  SRC data type
 * @param[in]  DST_DATA_TYPE  DST data type
 * @param[in]  M0             Number of src/dst rows
 * @param[in]  N0             Number of src/dst columns
 * @param[in]  DST_OFFSET     Quantization offset used for both the per-tensor and per-channel quantization
 * @param[in]  DST_SHIFT      Quantization shift for the per-tensor quantization
 * @param[in]  DST_MULTIPLIER Quantization multiplier for the per-tensor quantization
 * @param[in]  src            Input tile
 * @param[out] dst            Output tile
 */
#define T_QUANTIZE8_ASYMMETRIC(SRC_DATA_TYPE, DST_DATA_TYPE, M0, N0, DST_OFFSET, DST_SHIFT, DST_MULTIPLIER, src, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
            { \
                SRC_DATA_TYPE _tmp = 0; \
                SRC_DATA_TYPE _src = src[_m0].s[_n0]; \
                _src *= select((SRC_DATA_TYPE)1, ((SRC_DATA_TYPE)1 << (SRC_DATA_TYPE)(-DST_SHIFT)), ((SRC_DATA_TYPE)DST_SHIFT < (SRC_DATA_TYPE)0)); \
                SRC_DATA_TYPE overflow = _src == DST_MULTIPLIER && _src == INT_MIN; \
                long a_64 = (long)(_src); \
                long b_64 = (long)(DST_MULTIPLIER); \
                long ab_64 = a_64 * b_64; \
                long mask1 = 1 << 30; \
                long mask2 = 1 - (1 << 30); \
                long is_positive_or_zero = ab_64 >= 0; \
                long nudge = select(mask2, mask1, is_positive_or_zero); \
                SRC_DATA_TYPE ab_x2_high32 = CONVERT((ab_64 + nudge) / (long)(1ll << 31), SRC_DATA_TYPE); \
                _tmp = select(ab_x2_high32, (SRC_DATA_TYPE)INT_MAX, overflow); \
                if(DST_SHIFT >= 0) \
                { \
                    long mask = ((((int)1) << DST_SHIFT) - (int)1); \
                    long threshold = _tmp < (int)0 ? (mask >> 1) + (long)1 : (mask >> 1) + 0; \
                    _tmp = (_tmp & mask) > threshold ? (_tmp >> DST_SHIFT) + (int)1 : (_tmp >> DST_SHIFT); \
                } \
                _tmp += DST_OFFSET; \
                dst[_m0].s[_n0] = CONVERT_SAT(_tmp, DST_DATA_TYPE); \
            }) \
        }) \
    })

/** Conditional rowset (memset by row)
 *
 * @note Set the row to VALUE_TO_SET if the corresponding mask == 0
 *
 * @param[in]      DATA_TYPE    Data type
 * @param[in]      M0           Number of LHS rows
 * @param[in]      N0           Number of LHS columns
 * @param[in]      VALUE_TO_SET Value to set the row to
 * @param[in, out] a            Input/output tile
 * @param[in]      mask         Mask to check for setting the row to VALUE_TO_SET
 */
#define T_ROWSET_MASK(DATA_TYPE, M0, N0, VALUE_TO_SET, a, mask) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            LOOP_UNROLLING(int, _n0, 0, 1, N0, \
            { \
                a[_m0].s[_n0] = select((DATA_TYPE)(a[_m0].s[_n0]), (DATA_TYPE)(VALUE_TO_SET), (SELECT_DATA_TYPE(DATA_TYPE))(mask[_m0].v == (DATA_TYPE)0)); \
            }) \
        }) \
    })

/** Element-wise activation for floating point types
 *
 * @note Performs: dst = activation(src)
 *
 * @param[in]  DATA_TYPE       SRC/DST data type
 * @param[in]  M0              Number of SRC/DST rows
 * @param[in]  N0              Number of SRC/DST columns
 * @param[in]  ACTIVATION_TYPE Activation type
 * @param[in]  A_VAL           A value used for the activation (e.g. tanh_op, brelu, ..)
 * @param[in]  B_VAL           B value used for the activation (e.g. tanh_op, brelu, ..)
 * @param[in]  src             SRC tile
 * @param[out] dst             DST tile
 */
#define T_ACTIVATION(DATA_TYPE, M0, N0, ACTIVATION_TYPE, A_VAL, B_VAL, src, dst) \
    ({ \
        LOOP_UNROLLING(int, _m0, 0, 1, M0, \
        { \
            dst[_m0].v = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, N0, src[_m0].v, A_VAL, B_VAL); \
        }) \
    })

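// Example: a minimal sketch applying a ReLU in place to an M0 x N0 float tile
// (the tile name "c" is illustrative; A_VAL/B_VAL are unused by relu, and the
// lowercase op token follows the op##_op naming used by the activation helpers).
//
//     T_ACTIVATION(float, M0, N0, relu, 0.0f, 0.0f, c, c);
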
// RELU Activation
#define relu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (max((DATA_TYPE)ZERO_VALUE, x))
// Bounded RELU Activation
#define brelu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (min((DATA_TYPE)A_VAL, max((DATA_TYPE)ZERO_VALUE, x)))
// Lower Upper Bounded RELU Activation
#define lu_brelu_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (min(max(x, (DATA_TYPE)B_VAL), (DATA_TYPE)A_VAL))
// Hard Swish Activation
#define hard_swish_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (x * ((min(max((DATA_TYPE)(x + (DATA_TYPE)3.f), (DATA_TYPE)0.f), (DATA_TYPE)6.f)) * (DATA_TYPE)0.166666667f))
// Identity Activation
#define identity_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) (x)

#define ACT_OP_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) op##_op_quantized(DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x)
#define ACTIVATION_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x) ACT_OP_QUANTIZED(op, DATA_TYPE, VEC_SIZE, ZERO_VALUE, A_VAL, B_VAL, x)

#define V_ADD(A_VAL, B_VAL) ((A_VAL) + (B_VAL))
#define V_DIV(A_VAL, B_VAL) ((A_VAL) / (B_VAL))

987 /** Element-wise activation for quantized types
988  *
989  * @note Performs: activation(SRC) = DST
990  *
991  * @param[in] DATA_TYPE SRC/DST data type
992  * @param[in] M0 Number of SRC/DST rows
993  * @param[in] N0 Number of SRC/DST columns
994  * @param[in] ACTIVATION_TYPE Activation type
995  * @param[in] ZERO_VALUE The zero value to consider in the computation
996  * @param[in] A_VAL A value used for the activation (e.g. tanh_op, brelu, ...)
997  * @param[in] B_VAL B value used for the activation (e.g. tanh_op, brelu, ...)
998  * @param[in] src SRC tile
999  * @param[out] dst DST tile
1000  */
1001 #define T_ACTIVATION_QUANTIZED(DATA_TYPE, M0, N0, ACTIVATION_TYPE, ZERO_VALUE, A_VAL, B_VAL, src, dst) \
1002  ({ \
1003  LOOP_UNROLLING(int, _m0, 0, 1, M0, \
1004  { \
1005  dst[_m0].v = ACTIVATION_QUANTIZED(ACTIVATION_TYPE, DATA_TYPE, N0, ZERO_VALUE, A_VAL, B_VAL, src[_m0].v); \
1006  }) \
1007  })
1008 
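/** Illustrative usage sketch (not part of the library): bounded RELU on a
 *  uchar tile; zero_point, a_quant and b_quant are hypothetical quantized
 *  activation parameters (the quantized zero and the quantized bound).
 *
 * @code
 * T_ACTIVATION_QUANTIZED(uchar, M0, N0, brelu, zero_point, a_quant, b_quant, src, dst);
 * @endcode
 */
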
1009 /** Element-wise addition between two tiles
1010  *
1011  * @note Performs: LHS + RHS = DST
1012  *
1013  * @param[in] DATA_TYPE LHS/RHS/DST data type
1014  * @param[in] M0 Number of LHS rows
1015  * @param[in] N0 Number of LHS columns
1016  * @param[in] lhs LHS tile
1017  * @param[in] rhs RHS tile
1018  * @param[out] dst DST tile
1019  */
1020 #define T_ADD(DATA_TYPE, M0, N0, lhs, rhs, dst) \
1021  ({ \
1022  LOOP_UNROLLING(int, _m0, 0, 1, M0, \
1023  { \
1024  dst[_m0].v = lhs[_m0].v + rhs[_m0].v; \
1025  }) \
1026  })
1027 
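/** Illustrative usage sketch (not part of the library): accumulate one tile
 *  into another. dst may alias lhs, so this computes res += bias.
 *
 * @code
 * TILE(float, M0, N0, res);
 * TILE(float, M0, N0, bias);
 * T_ADD(float, M0, N0, res, bias, res);
 * @endcode
 */
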
1028 /** Element-wise addition with a constant value
1029  *
1030  * @note Performs: LHS + constant = DST
1031  *
1032  * @param[in] DATA_TYPE LHS/RHS/DST data type
1033  * @param[in] M0 Number of LHS rows
1034  * @param[in] N0 Number of LHS columns
1035  * @param[in] lhs LHS tile
1036  * @param[in] rhs_constant Constant value
1037  * @param[out] dst DST tile
1038  */
1039 #define T_ADD_CONSTANT(DATA_TYPE, M0, N0, lhs, rhs_constant, dst) \
1040  ({ \
1041  LOOP_UNROLLING(int, _m0, 0, 1, M0, \
1042  { \
1043  dst[_m0].v = lhs[_m0].v + (DATA_TYPE)rhs_constant; \
1044  }) \
1045  })
1046 
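/** Illustrative usage sketch (not part of the library): add a scalar offset to
 *  every element of a tile; OFFSET is a hypothetical compile-time constant,
 *  e.g. passed as a -D build option.
 *
 * @code
 * T_ADD_CONSTANT(int, M0, N0, acc, OFFSET, acc);
 * @endcode
 */
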
1047 #define T_ELTWISE_BROADCAST_ADD_X(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
1048 #define T_ELTWISE_BROADCAST_DIV_X(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE_BROADCAST_X(V_DIV, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
1049 
1050 /** Element-wise scale with a constant value
1051  *
1052  * @note Performs: LHS * constant = DST
1053  *
1054  * @param[in] DATA_TYPE LHS/RHS/DST data type
1055  * @param[in] M0 Number of LHS rows
1056  * @param[in] N0 Number of LHS columns
1057  * @param[in] lhs LHS tile
1058  * @param[in] rhs_constant Constant value
1059  * @param[out] dst DST tile
1060  */
1061 #define T_SCALE_CONSTANT(DATA_TYPE, M0, N0, lhs, rhs_constant, dst) \
1062  ({ \
1063  LOOP_UNROLLING(int, _m0, 0, 1, M0, \
1064  { \
1065  dst[_m0].v = lhs[_m0].v * (DATA_TYPE)rhs_constant; \
1066  }) \
1067  })
1068 
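/** Illustrative usage sketch (not part of the library): scale a tile in place
 *  by a scalar; SCALE is a hypothetical compile-time constant.
 *
 * @code
 * T_SCALE_CONSTANT(half, M0, N0, src, SCALE, src);
 * @endcode
 */
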
1069 /** Element-wise operation with RHS broadcasted (RHS has the X dimension only)
1070  *
1071  * @note Performs: LHS OP RHS[broadcasted] = DST
1072  * @note Both tiles are converted to DST_DATA_TYPE before the operation is applied
1073  *
1074  * @param[in] T_ELWISE_OP Elementwise operator to perform
1075  * @param[in] DST_DATA_TYPE DST data type
1076  * @param[in] M0 Number of LHS rows
1077  * @param[in] N0 Number of LHS columns
1078  * @param[in] lhs LHS tile
1079  * @param[in] rhs RHS tile
1080  * @param[out] dst DST tile
1081  */
1082 #define T_ELTWISE_BROADCAST_X(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
1083  ({ \
1084  LOOP_UNROLLING(int, _m0, 0, 1, M0, \
1085  { \
1086  dst[_m0].v = T_ELWISE_OP(CONVERT(lhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)), CONVERT(rhs[0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0))); \
1087  }) \
1088  })
1089 
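/** Illustrative usage sketch (not part of the library): add a 1 x N0 bias row
 *  to every row of an M0 x N0 tile by broadcasting the RHS along the Y
 *  dimension; acc and bias are hypothetical tiles.
 *
 * @code
 * TILE(float, 1, N0, bias);
 * T_ELTWISE_BROADCAST_ADD_X(float, M0, N0, acc, bias, acc);
 * @endcode
 */
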
1090 #define T_ELTWISE_ADD(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_ADD, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
1091 #define T_ELTWISE_DIV(DST_DATA_TYPE, M0, N0, lhs, rhs, dst) T_ELTWISE(V_DIV, DST_DATA_TYPE, M0, N0, lhs, rhs, dst)
1092 
1093 /** Element-wise operation between two tiles (LHS and RHS)
1094  *
1095  * @note Performs: LHS OP RHS = DST
1096  * @note Both tiles are converted to DST_DATA_TYPE before the operation is applied
1097  *
1098  * @param[in] T_ELWISE_OP Elementwise operator to perform
1099  * @param[in] DST_DATA_TYPE DST data type
1100  * @param[in] M0 Number of LHS rows
1101  * @param[in] N0 Number of LHS columns
1102  * @param[in] lhs LHS tile
1103  * @param[in] rhs RHS tile
1104  * @param[out] dst DST tile
1105  */
1106 #define T_ELTWISE(T_ELWISE_OP, DST_DATA_TYPE, M0, N0, lhs, rhs, dst) \
1107  ({ \
1108  LOOP_UNROLLING(int, _m0, 0, 1, M0, \
1109  { \
1110  dst[_m0].v = T_ELWISE_OP(CONVERT(lhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0)), CONVERT(rhs[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0))); \
1111  }) \
1112  })
1113 
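/** Illustrative usage sketch (not part of the library): element-wise division
 *  of two tiles; num, den and out are hypothetical tiles, with both operands
 *  converted to float before the division.
 *
 * @code
 * T_ELTWISE_DIV(float, M0, N0, num, den, out);
 * @endcode
 */
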
1114 /** Floor operation on a tile
1115  *
1116  * @note Performs: floor(SRC) = DST
1117  * @note The SRC tile is converted to DST_DATA_TYPE before the floor is applied
1118  *
1119  * @param[in] DST_DATA_TYPE DST data type
1120  * @param[in] M0 Number of SRC rows
1121  * @param[in] N0 Number of SRC columns
1122  * @param[in] src SRC tile
1123  * @param[out] dst DST tile
1124  */
1125 #define T_FLOOR(DST_DATA_TYPE, M0, N0, src, dst) \
1126  ({ \
1127  LOOP_UNROLLING(int, _m0, 0, 1, M0, \
1128  { \
1129  dst[_m0].v = floor(CONVERT(src[_m0].v, VEC_DATA_TYPE(DST_DATA_TYPE, N0))); \
1130  }) \
1131  })
1132 
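/** Illustrative usage sketch (not part of the library), assuming src and dst
 *  are M0 x N0 float tiles:
 *
 * @code
 * T_FLOOR(float, M0, N0, src, dst); // dst[_m0].v = floor(src[_m0].v), row by row
 * @endcode
 */
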
1133 /** Matrix multiplication
1134  *
1135  * @note Performs: LHS X RHS + DST = DST
1136  *
1137  * @param[in] LHS_DATA_TYPE LHS tile data type
1138  * @param[in] RHS_DATA_TYPE RHS tile data type
1139  * @param[in] DST_DATA_TYPE DST tile data type
1140  * @param[in] M0 Number of LHS rows
1141  * @param[in] N0 Number of RHS columns
1142  * @param[in] K0 Number of LHS columns
1143  * @param[in] LHS_LAYOUT LHS layout (T = transposed, NT = not transposed)
1144  * @param[in] RHS_LAYOUT RHS layout (T = transposed, NT = not transposed)
1145  * @param[in] lhs LHS tile
1146  * @param[in] rhs RHS tile
1147  * @param[in, out] dst DST tile
1148  */
1149 #define T_MMUL(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, LHS_LAYOUT, RHS_LAYOUT, lhs, rhs, dst) T_MMUL_##LHS_LAYOUT##_##RHS_LAYOUT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
1150 #define T_MMUL_NT_T(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_##LHS_DATA_TYPE##_##RHS_DATA_TYPE##_##DST_DATA_TYPE(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
1151 #define T_MMUL_NT_T_float_float_float(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
1152 #define T_MMUL_NT_T_half_half_float(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
1153 #define T_MMUL_NT_T_half_half_half(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
1154 #define T_MMUL_NT_T_char_char_int(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
1155 #define T_MMUL_NT_T_uchar_uchar_uint(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
1156 #define T_MMUL_NT_T_uchar_uchar_int(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst)
1157 #define T_MMUL_NT_T_FLOAT(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) \
1158  { \
1159  LOOP_UNROLLING(int, _m, 0, 1, M0, \
1160  { \
1161  LOOP_UNROLLING(int, _n, 0, 1, N0, \
1162  { \
1163  LOOP_UNROLLING(int, _k, 0, 1, K0, \
1164  { \
1165  dst[_m].s[_n] = fma((DST_DATA_TYPE)(lhs[_m].s[_k]), (DST_DATA_TYPE)(rhs[_n].s[_k]), dst[_m].s[_n]); \
1166  }) \
1167  }) \
1168  }) \
1169  }
1170 
1171 #define T_MMUL_NT_T_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, M0, N0, K0, lhs, rhs, dst) \
1172  ({ \
1173  LOOP_UNROLLING(int, _m, 0, 1, M0, \
1174  { \
1175  LOOP_UNROLLING(int, _n, 0, 1, N0, \
1176  { \
1177  DOT_PRODUCT_INTEGER8(LHS_DATA_TYPE, RHS_DATA_TYPE, DST_DATA_TYPE, K0, (lhs[_m].v), (rhs[_n].v), dst[_m].s[_n]); \
1178  }) \
1179  }) \
1180  })
1181 
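/** Illustrative usage sketch (not part of the library): accumulate an M0 x N0
 *  block of lhs x rhs^T into a destination tile. With the NT, T layouts, lhs
 *  is M0 x K0 (non-transposed) and rhs is N0 x K0 (transposed), so each row of
 *  rhs is dotted with each row of lhs.
 *
 * @code
 * TILE(float, M0, K0, lhs);
 * TILE(float, N0, K0, rhs);
 * TILE(float, M0, N0, dst);
 * LOOP_UNROLLING(int, i, 0, 1, M0, { dst[i].v = 0; }) // T_MMUL accumulates into dst
 * // ... fill lhs and rhs, e.g. with T_LOAD ...
 * T_MMUL(float, float, float, M0, N0, K0, NT, T, lhs, rhs, dst);
 * @endcode
 */
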
1182 #endif // ARM_COMPUTE_TILE_HELPERS_H