Compute Library
 22.11
gemm_helpers.h
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2019-2021 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
25 #include "helpers.h"
26 
27 /** Utility macro to access a vector with the scalar positions
28  *
29  * Supported cases: only the (offset, n0) pairs for which a matching scalar_access_{offset}_{n0} macro is defined below.
30  *
31  * @param[in] offset The offset within the vector. Offset can only be of the same size of the OpenCL vector (2,3,4,8,16)
32  * @param[in] n0 The number of consecutive columns to access. n0 + offset must be <= 16
33  * @param[in] x Vector to access
34  * @{
35  */
// Two-step expansion: SCALAR_ACCESS lets the preprocessor expand `offset` and `n0` first, so that
// SCALAR_ACCESS_STR pastes their expanded values (not the argument names) into scalar_access_{offset}_{n0}.
36 #define SCALAR_ACCESS_STR(offset, n0, x) scalar_access_##offset##_##n0(x)
37 #define SCALAR_ACCESS(offset, n0, x) SCALAR_ACCESS_STR(offset, n0, x)
38 
// Component selectors used by SCALAR_ACCESS: scalar_access_{offset}_{n0}(x) selects n0 consecutive
// components of x starting at component {offset}. Only the combinations listed here are defined.
39 // offset == 0
40 #define scalar_access_0_1(x) ((x).s0)
41 #define scalar_access_0_2(x) ((x).s01)
42 #define scalar_access_0_3(x) ((x).s012)
43 #define scalar_access_0_4(x) ((x).s0123)
44 #define scalar_access_0_8(x) ((x).s01234567)
45 #define scalar_access_0_16(x) ((x).s0123456789ABCDEF)
46 
47 // offset == 1
48 #define scalar_access_1_1(x) ((x).s1)
49 #define scalar_access_1_2(x) ((x).s12)
50 #define scalar_access_1_3(x) ((x).s123)
51 #define scalar_access_1_4(x) ((x).s1234)
52 #define scalar_access_1_8(x) ((x).s12345678)
53 
54 // offset == 2
55 #define scalar_access_2_1(x) ((x).s2)
56 #define scalar_access_2_2(x) ((x).s23)
57 #define scalar_access_2_3(x) ((x).s234)
58 #define scalar_access_2_4(x) ((x).s2345)
59 #define scalar_access_2_8(x) ((x).s23456789)
60 
61 // offset == 3
62 #define scalar_access_3_1(x) ((x).s3)
63 #define scalar_access_3_2(x) ((x).s34)
64 #define scalar_access_3_3(x) ((x).s345)
65 #define scalar_access_3_4(x) ((x).s3456)
66 #define scalar_access_3_8(x) ((x).s3456789A)
67 
68 // offset == 4
69 #define scalar_access_4_1(x) ((x).s4)
70 #define scalar_access_4_2(x) ((x).s45)
71 #define scalar_access_4_3(x) ((x).s456)
72 #define scalar_access_4_4(x) ((x).s4567)
73 #define scalar_access_4_8(x) ((x).s456789AB)
74 
75 // offset == 8
76 #define scalar_access_8_1(x) ((x).s8)
77 #define scalar_access_8_2(x) ((x).s89)
78 #define scalar_access_8_3(x) ((x).s89A)
79 #define scalar_access_8_4(x) ((x).s89AB)
80 #define scalar_access_8_8(x) ((x).s89ABCDEF)
81 
82 // offset == 12
83 #define scalar_access_12_1(x) ((x).sC)
84 #define scalar_access_12_2(x) ((x).sCD)
85 #define scalar_access_12_3(x) ((x).sCDE)
86 #define scalar_access_12_4(x) ((x).sCDEF)
87 
88 // offset == 16
// NOTE(review): despite the "16" in its name, this selects component sF (index 15) - confirm the intended offset.
89 #define scalar_access_16_1(x) ((x).sF)
/** @} */ // end of group SCALAR_ACCESS
90 
91 /** Loads the rows from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1) without allocating variables.
92  * @name LOAD_TENSOR_ROW_n
93  *
94  * @param[in] N0 The number of columns to load
95  * @param[in] DATA_TYPE The data type of variables
96  * @param[in] BASENAME The basename of the destination variables for the loaded rows
97  * @param[in] PTR The base pointer
98  * @param[in] COL_OFFSET The column vector offset. COL_OFFSET + N0 must be <= 16
99  * @param[in] STRIDE_Y The stride value in y-axis direction
100  * @param[in] Z The z-axis offset vector
101  * @{
102  */
// Each LOAD_TENSOR_ROW_n first expands LOAD_TENSOR_ROW_(n-1) and then assigns row n-1, so one invocation
// emits n assignment statements (each already terminated by ';'). Rows 10..15 use the suffixes A..F for
// both BASENAME and Z. LOAD_TENSOR_ROW_0 expands to an empty ({}) expression.
// The row address is computed as PTR + k * STRIDE_Y + Z##k before the cast to (__global DATA_TYPE *).
103 #define LOAD_TENSOR_ROW_0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
104  ({})
105 
106 #define LOAD_TENSOR_ROW_1(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
107  SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##0) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 0 * STRIDE_Y + Z##0));
108 
109 #define LOAD_TENSOR_ROW_2(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
110  LOAD_TENSOR_ROW_1(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
111  SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##1) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 1 * STRIDE_Y + Z##1));
112 
113 #define LOAD_TENSOR_ROW_3(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
114  LOAD_TENSOR_ROW_2(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
115  SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##2) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 2 * STRIDE_Y + Z##2));
116 
117 #define LOAD_TENSOR_ROW_4(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
118  LOAD_TENSOR_ROW_3(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
119  SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##3) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 3 * STRIDE_Y + Z##3));
120 
121 #define LOAD_TENSOR_ROW_5(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
122  LOAD_TENSOR_ROW_4(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
123  SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##4) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 4 * STRIDE_Y + Z##4));
124 
125 #define LOAD_TENSOR_ROW_6(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
126  LOAD_TENSOR_ROW_5(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
127  SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##5) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 5 * STRIDE_Y + Z##5));
128 
129 #define LOAD_TENSOR_ROW_7(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
130  LOAD_TENSOR_ROW_6(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
131  SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##6) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 6 * STRIDE_Y + Z##6));
132 
133 #define LOAD_TENSOR_ROW_8(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
134  LOAD_TENSOR_ROW_7(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
135  SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##7) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 7 * STRIDE_Y + Z##7));
136 
137 #define LOAD_TENSOR_ROW_9(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
138  LOAD_TENSOR_ROW_8(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
139  SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##8) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 8 * STRIDE_Y + Z##8));
140 
141 #define LOAD_TENSOR_ROW_10(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
142  LOAD_TENSOR_ROW_9(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
143  SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##9) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 9 * STRIDE_Y + Z##9));
144 
145 #define LOAD_TENSOR_ROW_11(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
146  LOAD_TENSOR_ROW_10(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
147  SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##A) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 10 * STRIDE_Y + Z##A));
148 
149 #define LOAD_TENSOR_ROW_12(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
150  LOAD_TENSOR_ROW_11(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
151  SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##B) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 11 * STRIDE_Y + Z##B));
152 
153 #define LOAD_TENSOR_ROW_13(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
154  LOAD_TENSOR_ROW_12(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
155  SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##C) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 12 * STRIDE_Y + Z##C));
156 
157 #define LOAD_TENSOR_ROW_14(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
158  LOAD_TENSOR_ROW_13(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
159  SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##D) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 13 * STRIDE_Y + Z##D));
160 
161 #define LOAD_TENSOR_ROW_15(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
162  LOAD_TENSOR_ROW_14(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
163  SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##E) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 14 * STRIDE_Y + Z##E));
164 
165 #define LOAD_TENSOR_ROW_16(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
166  LOAD_TENSOR_ROW_15(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) \
167  SCALAR_ACCESS(COL_OFFSET, N0, BASENAME##F) = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + 15 * STRIDE_Y + Z##F));
168 /** @}*/ // end of group LOAD_TENSOR_ROW_n
169 
170 /** Load tensor (consecutive rows and columns) with Z offset.
171  * @name LOAD_TENSOR
172  *
173  * Supported cases are M0=1,2,3,...,16 and N0=1,2,3,4,8,16
174  * The data to load is expected to have consecutive names for each row.
175  * E.g., for M0=3, and BASENAME=c, the expected data is c0, c1 and c2.
176  * The Z offset is expected to have consecutive names.
177  * E.g., for M0=3, and Z=zin, the expected Z offsets are zin0, zin1 and zin2.
178  *
179  * @param[in] M0 The number of consecutive rows
180  * @param[in] N0 The number of consecutive columns
181  * @param[in] DATA_TYPE The data type of the target
182  * @param[in] BASENAME The basename of the result variables
183  * @param[in] PTR The base pointer for the data
184  * @param[in] COL_OFFSET The column vector offset. COL_OFFSET + N0 must be <= 16
185  * @param[in] STRIDE_Y The stride in y-axis direction
186  * @param[in] Z The z-axis offset vector
187  * @{
188  */
// LOAD_TENSOR forwards to LOAD_TENSOR_STR so that M0 is macro-expanded before it is pasted into
// LOAD_TENSOR_ROW_##M0; M0 must therefore expand to a literal for which LOAD_TENSOR_ROW_{M0} is defined.
189 #define LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) LOAD_TENSOR_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)
190 #define LOAD_TENSOR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z) LOAD_TENSOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, COL_OFFSET, STRIDE_Y, Z)
191 /** @} */ // end of group LOAD_TENSOR
192 
193 /** Load 2D tensor (consecutive rows and columns) with Z offset.
194  * @name LOAD_TENSOR_M0Xn
195  *
196  * @param[in] M0 The number of rows to load [0-16]
197  * @param[in] N0 The number of columns to load [0-16]
198  * @param[in] DATA_TYPE The data type of variables
199  * @param[in] BASENAME The basename of the destination variables for the loaded rows
200  * @param[in] PTR The base pointer
201  * @param[in] STRIDE_Y The stride value in y-axis direction
202  * @param[in] Z The z-axis offset vector
203  * @{
204  */
// One macro per column count n. Widths 1, 2, 3, 4, 8 and 16 are forwarded to a single LOAD_TENSOR call with
// COL_OFFSET 0. The other widths (5-7, 9-15) are split into two or three LOAD_TENSOR calls: each extra call
// adds {cols already loaded} * sizeof(DATA_TYPE) to input_ptr and passes the same column count as COL_OFFSET,
// so that SCALAR_ACCESS writes the matching components of the destination vectors.
// LOAD_TENSOR_M0X0 expands to an empty ({}) expression.
// Every LOAD_TENSOR call takes eight arguments: (M0, N0, DATA_TYPE, a, ptr, COL_OFFSET, src_stride_y, zin).
205 #define LOAD_TENSOR_M0X0(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
206  ({})
207 
208 #define LOAD_TENSOR_M0X1(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
209  LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
210 
211 #define LOAD_TENSOR_M0X2(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
212  LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
213 
214 #define LOAD_TENSOR_M0X3(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
215  LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
216 
217 #define LOAD_TENSOR_M0X4(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
218  LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
219 
220 #define LOAD_TENSOR_M0X5(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
221  LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \
222  LOAD_TENSOR(M0, 1, DATA_TYPE, a, input_ptr + 4 * sizeof(DATA_TYPE), 4, src_stride_y, zin);
223 
224 #define LOAD_TENSOR_M0X6(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
225  LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \
226  LOAD_TENSOR(M0, 2, DATA_TYPE, a, input_ptr + 4 * sizeof(DATA_TYPE), 4, src_stride_y, zin);
227 
228 #define LOAD_TENSOR_M0X7(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
229  LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \
230  LOAD_TENSOR(M0, 3, DATA_TYPE, a, input_ptr + 4 * sizeof(DATA_TYPE), 4, src_stride_y, zin);
231 
232 #define LOAD_TENSOR_M0X8(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
233  LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
234 
235 #define LOAD_TENSOR_M0X9(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
236  LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \
237  LOAD_TENSOR(M0, 1, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);
238 
239 #define LOAD_TENSOR_M0X10(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
240  LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \
241  LOAD_TENSOR(M0, 2, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);
242 
243 #define LOAD_TENSOR_M0X11(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
244  LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \
245  LOAD_TENSOR(M0, 3, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);
246 
247 #define LOAD_TENSOR_M0X12(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
248  LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \
249  LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin);
250 
251 #define LOAD_TENSOR_M0X13(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
252  LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \
253  LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); \
254  LOAD_TENSOR(M0, 1, DATA_TYPE, a, input_ptr + 12 * sizeof(DATA_TYPE), 12, src_stride_y, zin);
255 
256 #define LOAD_TENSOR_M0X14(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
257  LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \
258  LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); \
259  LOAD_TENSOR(M0, 2, DATA_TYPE, a, input_ptr + 12 * sizeof(DATA_TYPE), 12, src_stride_y, zin);
260 
261 #define LOAD_TENSOR_M0X15(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
262  LOAD_TENSOR(M0, 8, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin); \
263  LOAD_TENSOR(M0, 4, DATA_TYPE, a, input_ptr + 8 * sizeof(DATA_TYPE), 8, src_stride_y, zin); \
264  LOAD_TENSOR(M0, 3, DATA_TYPE, a, input_ptr + 12 * sizeof(DATA_TYPE), 12, src_stride_y, zin);
265 
266 #define LOAD_TENSOR_M0X16(M0, N0, DATA_TYPE, a, input_ptr, src_stride_y, zin) \
267  LOAD_TENSOR(M0, N0, DATA_TYPE, a, input_ptr, 0, src_stride_y, zin);
268 /** @}*/ // end of group LOAD_TENSOR_M0Xn
269 
270 /** Load 2D tensor (consecutive rows and columns) with Z offset.
271  * @name LOAD_TENSOR_M0XN0
272  *
273  * @param[in] M0 The number of consecutive rows [0-16]
274  * @param[in] N0 The number of consecutive columns [0-16]
275  * @param[in] DATA_TYPE The data type of the target
276  * @param[in] BASENAME The basename of the result variables
277  * @param[in] PTR The base pointer for the data
278  * @param[in] STRIDE_Y The stride in y-axis direction
279  * @param[in] Z The z-axis offset vector
280  * @{
281  */
// LOAD_TENSOR_M0XN0 forwards to LOAD_TENSOR_M0XN0_STR so that N0 is macro-expanded before it is pasted
// into LOAD_TENSOR_M0X##N0; N0 must therefore expand to a literal for which LOAD_TENSOR_M0X{N0} is defined.
282 #define LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) LOAD_TENSOR_M0X##N0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
283 #define LOAD_TENSOR_M0XN0(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z) LOAD_TENSOR_M0XN0_STR(M0, N0, DATA_TYPE, BASENAME, PTR, STRIDE_Y, Z)
/** @} */ // end of group LOAD_TENSOR_M0XN0
284 
285 /** Loads the rows from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1).
286  * @name LOAD_ROW_n
287  *
288  * @param[in] N0 The number of columns to load
289  * @param[in] DATA_TYPE The data type of variables
290  * @param[in] BASENAME The basename of the destination variables for the loaded rows
291  * @param[in] PTR The base pointer
292  * @param[in] OFFSET The offset within a row
293  * @param[in] STRIDE_Y The stride value in y-axis direction
294  * @param[in] Z The z-axis offset vector
295  * @{
296  */
// Each LOAD_ROW_n first expands LOAD_ROW_(n-1) and then emits one more row. Every row is a declaration
// with initializer (VEC_DATA_TYPE(DATA_TYPE, N0) BASENAME##k = ...), so BASENAME0..BASENAME(n-1) must not
// already be declared in the enclosing scope. Rows 10..15 use the suffixes A..F for both BASENAME and Z.
// The row address is computed as PTR + OFFSET + k * STRIDE_Y + Z##k before the cast to (__global DATA_TYPE *).
297 #define LOAD_ROW_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
298  VEC_DATA_TYPE(DATA_TYPE, N0) \
299  BASENAME##0 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y + Z##0));
300 
301 #define LOAD_ROW_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
302  LOAD_ROW_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
303  VEC_DATA_TYPE(DATA_TYPE, N0) \
304  BASENAME##1 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y + Z##1));
305 
306 #define LOAD_ROW_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
307  LOAD_ROW_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
308  VEC_DATA_TYPE(DATA_TYPE, N0) \
309  BASENAME##2 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y + Z##2));
310 
311 #define LOAD_ROW_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
312  LOAD_ROW_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
313  VEC_DATA_TYPE(DATA_TYPE, N0) \
314  BASENAME##3 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y + Z##3));
315 
316 #define LOAD_ROW_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
317  LOAD_ROW_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
318  VEC_DATA_TYPE(DATA_TYPE, N0) \
319  BASENAME##4 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y + Z##4));
320 
321 #define LOAD_ROW_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
322  LOAD_ROW_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
323  VEC_DATA_TYPE(DATA_TYPE, N0) \
324  BASENAME##5 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y + Z##5));
325 
326 #define LOAD_ROW_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
327  LOAD_ROW_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
328  VEC_DATA_TYPE(DATA_TYPE, N0) \
329  BASENAME##6 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y + Z##6));
330 
331 #define LOAD_ROW_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
332  LOAD_ROW_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
333  VEC_DATA_TYPE(DATA_TYPE, N0) \
334  BASENAME##7 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y + Z##7));
335 
336 #define LOAD_ROW_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
337  LOAD_ROW_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
338  VEC_DATA_TYPE(DATA_TYPE, N0) \
339  BASENAME##8 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y + Z##8));
340 
341 #define LOAD_ROW_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
342  LOAD_ROW_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
343  VEC_DATA_TYPE(DATA_TYPE, N0) \
344  BASENAME##9 = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y + Z##9));
345 
346 #define LOAD_ROW_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
347  LOAD_ROW_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
348  VEC_DATA_TYPE(DATA_TYPE, N0) \
349  BASENAME##A = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y + Z##A));
350 
351 #define LOAD_ROW_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
352  LOAD_ROW_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
353  VEC_DATA_TYPE(DATA_TYPE, N0) \
354  BASENAME##B = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y + Z##B));
355 
356 #define LOAD_ROW_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
357  LOAD_ROW_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
358  VEC_DATA_TYPE(DATA_TYPE, N0) \
359  BASENAME##C = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y + Z##C));
360 
361 #define LOAD_ROW_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
362  LOAD_ROW_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
363  VEC_DATA_TYPE(DATA_TYPE, N0) \
364  BASENAME##D = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y + Z##D));
365 
366 #define LOAD_ROW_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
367  LOAD_ROW_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
368  VEC_DATA_TYPE(DATA_TYPE, N0) \
369  BASENAME##E = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y + Z##E));
370 
371 #define LOAD_ROW_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
372  LOAD_ROW_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
373  VEC_DATA_TYPE(DATA_TYPE, N0) \
374  BASENAME##F = VLOAD(N0)(0, (__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y + Z##F));
375 
376 /** @}*/ // end of group LOAD_ROW_n
377 
378 /** Load Blocks (consecutive rows and columns) with Z offset.
379  * @name LOAD_BLOCK
380  *
381  * Supported cases are M0=1,2,3,...,16 and N0=1,2,3,4,8,16
382  * The data to load is expected to have consecutive names for each row.
383  * E.g., for M0=3, and BASENAME=c, the expected data is c0, c1 and c2.
384  * The Z offset is expected to have consecutive names.
385  * E.g., for M0=3, and Z=zin, the expected Z offsets are zin0, zin1 and zin2.
386  *
387  * @param[in] M0 The number of consecutive rows
388  * @param[in] N0 The number of consecutive columns
389  * @param[in] DATA_TYPE The data type of the target
390  * @param[in] BASENAME The basename of the result variables
391  * @param[in] PTR The base pointer for the data
392  * @param[in] OFFSET The offset within a row
393  * @param[in] STRIDE_Y The stride in y-axis direction
394  * @param[in] Z The z-axis offset vector
395  * @{
396  */
// LOAD_BLOCK forwards to LOAD_BLOCK_STR so that M0 is macro-expanded before it is pasted into
// LOAD_ROW_##M0; M0 must therefore expand to a literal for which LOAD_ROW_{M0} is defined.
397 #define LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_ROW_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
398 #define LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_BLOCK_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
399 /** @} */ // end of group LOAD_BLOCK
400 
401 /** Partially load the 0 to (n-1)th rows of the given variables
402  * @name LOAD_ROW_PARTIAL_n
403  * Within each row, load the lower @p LOAD_N0 elements of vectors of width @p N0
404  *
405  * @note in case @p LOAD_N0 != 1, 2, 3, 4, 8, 16, extra vload(s) will be invoked, thus incurring small performance penalty.
406  *
407  * @param[in] N0 The width of the passed in vector. Supported: 1, 2, 3, 4, 8, 16
408  * @param[in] LOAD_N0 The **lower** size of the vectors to load. Supported: [1-16] and <= @p N0
409  * @param[in] DATA_TYPE The data type of the vectors
410  * @param[in] BASENAME The basename of the variables
411  * @param[in] PTR The base pointer
412  * @param[in] OFFSET The offset within a row
413  * @param[in] STRIDE_Y The stride value in y-axis direction
414  * @param[in] Z The offset in z-axis direction
415  * @{
416  */
// Each LOAD_ROW_PARTIAL_n first expands LOAD_ROW_PARTIAL_(n-1) and then emits one more row.
// No declaration is emitted: BASENAME##k is passed as the first argument to whatever VLOAD_PARTIAL(N0, LOAD_N0)
// expands to, so the row variables BASENAME0..BASENAME(n-1) must already exist at the point of use.
// Rows 10..15 use the suffixes A..F for both BASENAME and Z.
417 #define LOAD_ROW_PARTIAL_1(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
418  VLOAD_PARTIAL(N0, LOAD_N0) \
419  (BASENAME##0, 0, (__global DATA_TYPE *)(PTR + OFFSET + 0 * STRIDE_Y + Z##0));
420 
421 #define LOAD_ROW_PARTIAL_2(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
422  LOAD_ROW_PARTIAL_1(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
423  VLOAD_PARTIAL(N0, LOAD_N0) \
424  (BASENAME##1, 0, (__global DATA_TYPE *)(PTR + OFFSET + 1 * STRIDE_Y + Z##1));
425 
426 #define LOAD_ROW_PARTIAL_3(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
427  LOAD_ROW_PARTIAL_2(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
428  VLOAD_PARTIAL(N0, LOAD_N0) \
429  (BASENAME##2, 0, (__global DATA_TYPE *)(PTR + OFFSET + 2 * STRIDE_Y + Z##2));
430 
431 #define LOAD_ROW_PARTIAL_4(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
432  LOAD_ROW_PARTIAL_3(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
433  VLOAD_PARTIAL(N0, LOAD_N0) \
434  (BASENAME##3, 0, (__global DATA_TYPE *)(PTR + OFFSET + 3 * STRIDE_Y + Z##3));
435 
436 #define LOAD_ROW_PARTIAL_5(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
437  LOAD_ROW_PARTIAL_4(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
438  VLOAD_PARTIAL(N0, LOAD_N0) \
439  (BASENAME##4, 0, (__global DATA_TYPE *)(PTR + OFFSET + 4 * STRIDE_Y + Z##4));
440 
441 #define LOAD_ROW_PARTIAL_6(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
442  LOAD_ROW_PARTIAL_5(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
443  VLOAD_PARTIAL(N0, LOAD_N0) \
444  (BASENAME##5, 0, (__global DATA_TYPE *)(PTR + OFFSET + 5 * STRIDE_Y + Z##5));
445 
446 #define LOAD_ROW_PARTIAL_7(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
447  LOAD_ROW_PARTIAL_6(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
448  VLOAD_PARTIAL(N0, LOAD_N0) \
449  (BASENAME##6, 0, (__global DATA_TYPE *)(PTR + OFFSET + 6 * STRIDE_Y + Z##6));
450 
451 #define LOAD_ROW_PARTIAL_8(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
452  LOAD_ROW_PARTIAL_7(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
453  VLOAD_PARTIAL(N0, LOAD_N0) \
454  (BASENAME##7, 0, (__global DATA_TYPE *)(PTR + OFFSET + 7 * STRIDE_Y + Z##7));
455 
456 #define LOAD_ROW_PARTIAL_9(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
457  LOAD_ROW_PARTIAL_8(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
458  VLOAD_PARTIAL(N0, LOAD_N0) \
459  (BASENAME##8, 0, (__global DATA_TYPE *)(PTR + OFFSET + 8 * STRIDE_Y + Z##8));
460 
461 #define LOAD_ROW_PARTIAL_10(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
462  LOAD_ROW_PARTIAL_9(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
463  VLOAD_PARTIAL(N0, LOAD_N0) \
464  (BASENAME##9, 0, (__global DATA_TYPE *)(PTR + OFFSET + 9 * STRIDE_Y + Z##9));
465 
466 #define LOAD_ROW_PARTIAL_11(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
467  LOAD_ROW_PARTIAL_10(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
468  VLOAD_PARTIAL(N0, LOAD_N0) \
469  (BASENAME##A, 0, (__global DATA_TYPE *)(PTR + OFFSET + 10 * STRIDE_Y + Z##A));
470 
471 #define LOAD_ROW_PARTIAL_12(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
472  LOAD_ROW_PARTIAL_11(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
473  VLOAD_PARTIAL(N0, LOAD_N0) \
474  (BASENAME##B, 0, (__global DATA_TYPE *)(PTR + OFFSET + 11 * STRIDE_Y + Z##B));
475 
476 #define LOAD_ROW_PARTIAL_13(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
477  LOAD_ROW_PARTIAL_12(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
478  VLOAD_PARTIAL(N0, LOAD_N0) \
479  (BASENAME##C, 0, (__global DATA_TYPE *)(PTR + OFFSET + 12 * STRIDE_Y + Z##C));
480 
481 #define LOAD_ROW_PARTIAL_14(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
482  LOAD_ROW_PARTIAL_13(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
483  VLOAD_PARTIAL(N0, LOAD_N0) \
484  (BASENAME##D, 0, (__global DATA_TYPE *)(PTR + OFFSET + 13 * STRIDE_Y + Z##D));
485 
486 #define LOAD_ROW_PARTIAL_15(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
487  LOAD_ROW_PARTIAL_14(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
488  VLOAD_PARTIAL(N0, LOAD_N0) \
489  (BASENAME##E, 0, (__global DATA_TYPE *)(PTR + OFFSET + 14 * STRIDE_Y + Z##E));
490 
491 #define LOAD_ROW_PARTIAL_16(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
492  LOAD_ROW_PARTIAL_15(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) \
493  VLOAD_PARTIAL(N0, LOAD_N0) \
494  (BASENAME##F, 0, (__global DATA_TYPE *)(PTR + OFFSET + 15 * STRIDE_Y + Z##F));
495 /** @} */ // end of group LOAD_ROW_PARTIAL_n
496 
497 /** Partially load a block of the given size LOAD_M0xLOAD_N0
498  * @name LOAD_BLOCK_PARTIAL
499  *
500  * @note The vector width @p N0 is also required for correct partial loading behaviour.
501  * @note in case @p LOAD_N0 != 1, 2, 3, 4, 8, 16, extra vload(s) will be invoked, thus incurring small performance penalty.
502  *
503  * The data to load is expected to have consecutive names for each row.
504  * E.g., for LOAD_M0=3 and basename=c, the expected names are c0, c1 and c2.
505  * The Z offset is expected to have consecutive names.
506  * E.g., for LOAD_M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
507  *
508  * @param[in] LOAD_M0 The number of rows to load. Supported: 1-16
509  * @param[in] LOAD_N0 The lower number of elements of vectors to load. Supported: 1-16 and <= @p N0
510  * @param[in] N0 The size of each vector. Supported: 1, 2, 3, 4, 8, 16
511  * @param[in] DATA_TYPE The data type of the vectors
512  * @param[in] BASENAME The basename of the variables
513  * @param[in] PTR The base pointer
514  * @param[in] OFFSET The offset within a row
515  * @param[in] STRIDE_Y The stride value in y-axis direction
516  * @param[in] Z The offset in z-axis direction
517  * @{
518  */
// LOAD_BLOCK_PARTIAL forwards to LOAD_BLOCK_PARTIAL_STR so that LOAD_M0 is macro-expanded before it is
// pasted into LOAD_ROW_PARTIAL_##LOAD_M0. Note the argument reorder: this macro takes (LOAD_M0, LOAD_N0, N0, ...)
// whereas LOAD_ROW_PARTIAL_n is invoked with (N0, LOAD_N0, ...).
519 #define LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_ROW_PARTIAL_##LOAD_M0(N0, LOAD_N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
520 #define LOAD_BLOCK_PARTIAL(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z) LOAD_BLOCK_PARTIAL_STR(LOAD_M0, LOAD_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
/** @} */ // end of group LOAD_BLOCK_PARTIAL
521 /** Load a block that can be partial in both x and y dimensions
522  *
523  * @note in cases @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vload(s) will be invoked, thus incurring small performance penalty.
524  *
525  * The data to load is expected to have consecutive names for each row.
526  * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
527  * The Z offset is expected to have consecutive names.
528  * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
529  *
530  * @param[in] M0 The number of rows to load, for non-partial blocks. Supported: 1-16
531  * @param[in] N0 The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
532  * @param[in] DATA_TYPE The data type of the vectors
533  * @param[in] BASENAME The basename of the variables
534  * @param[in] PTR The base pointer
535  * @param[in] OFFSET The offset within a row
536  * @param[in] STRIDE_Y The stride value in y-axis direction
537  * @param[in] Z The offset in z-axis direction
538  * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported range: [1, @p M0)
539  * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported range: [1, @p N0)
540  * @param[in] PARTIAL_COND_Y Condition on the y axis to perform the partial load Y. True to use PARTIAL_STORE_M0 rather than M0.
541  * @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial load X. True to use PARTIAL_STORE_N0 rather than N0.
542  */
// Expands to a bare if / else-if / else chain (no do { } while(0) wrapper), selecting one of four
// LOAD_BLOCK_PARTIAL variants: full block, partial in y only, partial in x only, partial in both.
// PARTIAL_COND_X and PARTIAL_COND_Y each appear in three conditions, so they may be evaluated more than once.
// All four LOAD_BLOCK_PARTIAL calls are expanded by the preprocessor regardless of which branch runs, so
// M0, N0, PARTIAL_STORE_M0 and PARTIAL_STORE_N0 must all expand to values accepted by LOAD_BLOCK_PARTIAL.
543 #define LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
544  if(!(PARTIAL_COND_X) && !(PARTIAL_COND_Y)) \
545  { \
546  LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
547  } \
548  else if((PARTIAL_COND_Y) && !(PARTIAL_COND_X)) \
549  { \
550  LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
551  } \
552  else if(!(PARTIAL_COND_Y) && (PARTIAL_COND_X)) \
553  { \
554  LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
555  } \
556  else \
557  { \
558  LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
559  }
560 /** Load a block that can only be partial in x but not y.
561  *
562  * @note in case @p N0 or @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vload(s) will be invoked, thus incurring small performance penalty.
563  *
564  * The data to load is expected to have consecutive names for each row.
565  * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
566  * The Z offset is expected to have consecutive names.
567  * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
568  * The macro expands to a bare if/else statement (it is not wrapped in do { } while(0)), so it must be used where a full statement is allowed.
569  * @param[in] M0 The number of rows to load. It is always passed unchanged to LOAD_BLOCK_PARTIAL. Supported: 1-16
570  * @param[in] N0 The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
571  * @param[in] DATA_TYPE The data type of the vectors
572  * @param[in] BASENAME The basename of the variables
573  * @param[in] PTR The base pointer
574  * @param[in] OFFSET The offset within a row
575  * @param[in] STRIDE_Y The stride value in y-axis direction
576  * @param[in] Z The offset in z-axis direction
577  * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported range: [1, @p N0)
578  * @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial load X. True to use PARTIAL_STORE_N0 rather than N0.
579  */
580 #define LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X) \
581  if(!(PARTIAL_COND_X)) \
582  { /* PARTIAL_COND_X is false: N0 is passed unchanged */ \
583  LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
584  } \
585  else \
586  { /* PARTIAL_COND_X is true: PARTIAL_STORE_N0 replaces N0 in the second argument; the third argument stays N0 */ \
587  LOAD_BLOCK_PARTIAL(M0, PARTIAL_STORE_N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
588  }
589 /** Load a block that can only be partial in y but not x.
590  *
591  * @note in case @p N0 != 1, 2, 3, 4, 8, 16, extra vload(s) will be invoked, thus incurring small performance penalty.
592  *
593  * The data to load is expected to have consecutive names for each row.
594  * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
595  * The Z offset is expected to have consecutive names.
596  * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
597  * The macro expands to a bare if/else statement (it is not wrapped in do { } while(0)), so it must be used where a full statement is allowed.
598  * @param[in] M0 The number of rows to load, for non-partial blocks. Supported: 1-16
599  * @param[in] N0 The size of each vector. It is always passed unchanged to LOAD_BLOCK_PARTIAL. Supported: 1, 2, 3, 4, 8, 16
600  * @param[in] DATA_TYPE The data type of the vectors
601  * @param[in] BASENAME The basename of the variables
602  * @param[in] PTR The base pointer
603  * @param[in] OFFSET The offset within a row
604  * @param[in] STRIDE_Y The stride value in y-axis direction
605  * @param[in] Z The offset in z-axis direction
606  * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported range: [1, @p M0)
607  * @param[in] PARTIAL_COND_Y Condition on the y axis to perform the partial load Y. True to use PARTIAL_STORE_M0 rather than M0.
608  */
609 #define LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y) \
610  if(!(PARTIAL_COND_Y)) \
611  { /* PARTIAL_COND_Y is false: M0 is passed unchanged */ \
612  LOAD_BLOCK_PARTIAL(M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
613  } \
614  else \
615  { /* PARTIAL_COND_Y is true: PARTIAL_STORE_M0 replaces M0 */ \
616  LOAD_BLOCK_PARTIAL(PARTIAL_STORE_M0, N0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z); \
617  }
618 /** @} */ // end of group LOAD_BLOCK_PARTIAL
619 /** Boundary-aware GeMM block load
620  * @name LOAD_BLOCK_BOUNDARY_AWARE
621  * This macro assumes the following schemes to achieve boundary-awareness:
622  * - Overlapping load in Y axis from lhs tensor. This implies lhs has no padding along y dim.
623  * - Non-overlapping (normal) load from rhs tensor. This implies rhs can have paddings.
624  * - Overlapping load in Y axis from bias tensor. This implies bias has no padding along y dim.
625  * The macro then ensures that the src tensor can be loaded without any paddings in both x and y dim.
626  *
627  * In the y dimension, we place the partial blocks **at the beginning** while in the x dimension, we place the partial
628  * blocks **at the end**.
629  * Say, the src tensor is of shape MxN and we have M0 and N0 as the block size, this is how we define "partial blocks"/
630  * "boundary block" (we use the 2 terms "partial blocks" and "boundary blocks" interchangeably) and its various parameters:
631  *
632  * *--x--> x == 0 x == 1
633  * | |<------------------------------N-------------------------->|
634  * y |<--------------N0------------->|<----PARTIAL_STORE_N0----->|
635  * | -------------#############################################################
636  * * | | |...............................|...........................|
637  * y == 0 | PAR_..._M0 |......Boundary block in y......|.Boundary block in x and y.|
638  * | | |...............................|...........................|
639  * M --#############################################################
640  * | | | |...........................|
641  * y == 1 | M0 | Non-boundary block |....Boundary block in x....|
642  * | | | |...........................|
643  * |------------#############################################################
644  *
645  * Then @p PARTIAL_STORE_M0 = M % M0 and @p PARTIAL_STORE_N0 = N % N0
646  *
647  * @note in cases @p PARTIAL_STORE_N0 != 1, 2, 3, 4, 8, 16, extra vload(s) will be invoked, thus incurring small performance penalty.
648  *
649  * It automatically detects whether a given M, N, M0, N0 combination can yield partial blocks in the X and/or Y dimension,
650  * and selects the corresponding load method such that the boundary detection logic is only added when needed.
651  *
652  * The data to load is expected to have consecutive names for each row.
653  * E.g., for M0=3 and basename=c, the expected names are c0, c1 and c2.
654  * The Z offset is expected to have consecutive names.
655  * E.g., for M0=3 and Z=zin, the expected z offset names are zin0, zin1 and zin2.
656  *
657  * The macro will result in a declaration of @p M0 vectors of size @p N0 with data
658  * type @p DATA_TYPE containing values partially loaded from the specified
659  * address in memory. The remaining (N0 - PARTIAL_STORE_N0) elements will be
660  * filled with zeros.
661  *
662  * @param[in] M0 The number of rows to load, for non-partial blocks. Supported: 1-16
663  * @param[in] N0 The size of each vector, for non-partial blocks. Supported: 1, 2, 3, 4, 8, 16
664  * @param[in] DATA_TYPE The data type of the vectors
665  * @param[in] BASENAME The basename of the variables
666  * @param[in] PTR The base pointer
667  * @param[in] OFFSET The offset within a row
668  * @param[in] STRIDE_Y The stride value in y-axis direction
669  * @param[in] Z The offset in z-axis direction
670  * @param[in] PARTIAL_STORE_M0 The partial size in y, for partial blocks. Supported: [0, @p M0)
671  * @param[in] PARTIAL_STORE_N0 The partial size in x, for partial blocks. Supported: [0, @p N0)
672  * @param[in] PARTIAL_COND_Y Condition on the y axis to perform the partial load Y. True to use PARTIAL_STORE_M0 rather than M0.
673  * @param[in] PARTIAL_COND_X Condition on the x axis to perform the partial load X. True to use PARTIAL_STORE_N0 rather than N0.
674  * @{
675  */
676 #if PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
677 // Case1: No partial blocks in either x or y. The variant is chosen from the build-time defines PARTIAL_STORE_M0 / PARTIAL_STORE_N0 (not from the macro arguments of the same name); here the PARTIAL_* arguments are ignored.
678 #define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
679  LOAD_BLOCK(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z)
680
681 #elif PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 == 0
682 // Case2: Partial blocks in y only. PARTIAL_STORE_N0 and PARTIAL_COND_X are ignored.
683 #define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
684  REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \
685  LOAD_BLOCK_PARTIAL_IN_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_COND_Y)
686
687 #elif PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 > 0
688 // Case3: Partial blocks in x only. PARTIAL_STORE_M0 and PARTIAL_COND_Y are ignored.
689 #define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
690  REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \
691  LOAD_BLOCK_PARTIAL_IN_X(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_N0, PARTIAL_COND_X)
692
693 #else // PARTIAL_STORE_M0 > 0 && PARTIAL_STORE_N0 > 0
694 // Case4: Partial blocks in both x and y
695 #define LOAD_BLOCK_BOUNDARY_AWARE(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X) \
696  REPEAT_VAR_INIT_TO_CONST(M0, VEC_DATA_TYPE(DATA_TYPE, N0), BASENAME, 0); \
697  LOAD_BLOCK_PARTIAL_IN_X_AND_Y(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Z, PARTIAL_STORE_M0, PARTIAL_STORE_N0, PARTIAL_COND_Y, PARTIAL_COND_X)
698
699 #endif // PARTIAL_STORE_M0 == 0 && PARTIAL_STORE_N0 == 0
700 
701 /** Loads the rows from 0 to n-1 from a 2D image into the given variables (BASENAME0 to BASENAMEn-1).
702  * @name LOAD_TEXTURE2D_ROW_n
703  * The variables are assigned, not declared, and rows 10 to 15 use the suffixes A to F. The coordinate and step arguments are parenthesised in the expansion so that compound expressions can be passed. NOTE(review): consecutive rows are chained without an explicit ';', which presumably relies on the READ_IMAGE2D expansion (not defined in this section) terminating each statement - confirm.
704  * @param[in] N0 The number of pixels to read
705  * @param[in] DATA_TYPE The data type of variables
706  * @param[in] BASENAME The basename of the destination variables for the loaded rows
707  * @param[in] IMG The 2D OpenCL image object
708  * @param[in] X_COORD The x coordinate for the top-left pixel
709  * @param[in] Y_COORD The y coordinate for the top-left pixel
710  * @param[in] X_STEP_ROW The incremental step row for the x coordinate (in pixels)
711  * @param[in] Y_STEP_ROW The incremental step row for the y coordinate (in pixels)
712  * @{
713  */
714 #define LOAD_TEXTURE2D_ROW_1(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
715  BASENAME##0 = READ_IMAGE2D(DATA_TYPE, N0, IMG, ((X_COORD) + 0 * (X_STEP_ROW)), ((Y_COORD) + 0 * (Y_STEP_ROW)))
716
717 #define LOAD_TEXTURE2D_ROW_2(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
718  LOAD_TEXTURE2D_ROW_1(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
719  BASENAME##1 = READ_IMAGE2D(DATA_TYPE, N0, IMG, ((X_COORD) + 1 * (X_STEP_ROW)), ((Y_COORD) + 1 * (Y_STEP_ROW)))
720
721 #define LOAD_TEXTURE2D_ROW_3(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
722  LOAD_TEXTURE2D_ROW_2(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
723  BASENAME##2 = READ_IMAGE2D(DATA_TYPE, N0, IMG, ((X_COORD) + 2 * (X_STEP_ROW)), ((Y_COORD) + 2 * (Y_STEP_ROW)))
724
725 #define LOAD_TEXTURE2D_ROW_4(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
726  LOAD_TEXTURE2D_ROW_3(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
727  BASENAME##3 = READ_IMAGE2D(DATA_TYPE, N0, IMG, ((X_COORD) + 3 * (X_STEP_ROW)), ((Y_COORD) + 3 * (Y_STEP_ROW)))
728
729 #define LOAD_TEXTURE2D_ROW_5(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
730  LOAD_TEXTURE2D_ROW_4(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
731  BASENAME##4 = READ_IMAGE2D(DATA_TYPE, N0, IMG, ((X_COORD) + 4 * (X_STEP_ROW)), ((Y_COORD) + 4 * (Y_STEP_ROW)))
732
733 #define LOAD_TEXTURE2D_ROW_6(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
734  LOAD_TEXTURE2D_ROW_5(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
735  BASENAME##5 = READ_IMAGE2D(DATA_TYPE, N0, IMG, ((X_COORD) + 5 * (X_STEP_ROW)), ((Y_COORD) + 5 * (Y_STEP_ROW)))
736
737 #define LOAD_TEXTURE2D_ROW_7(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
738  LOAD_TEXTURE2D_ROW_6(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
739  BASENAME##6 = READ_IMAGE2D(DATA_TYPE, N0, IMG, ((X_COORD) + 6 * (X_STEP_ROW)), ((Y_COORD) + 6 * (Y_STEP_ROW)))
740
741 #define LOAD_TEXTURE2D_ROW_8(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
742  LOAD_TEXTURE2D_ROW_7(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
743  BASENAME##7 = READ_IMAGE2D(DATA_TYPE, N0, IMG, ((X_COORD) + 7 * (X_STEP_ROW)), ((Y_COORD) + 7 * (Y_STEP_ROW)))
744
745 #define LOAD_TEXTURE2D_ROW_9(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
746  LOAD_TEXTURE2D_ROW_8(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
747  BASENAME##8 = READ_IMAGE2D(DATA_TYPE, N0, IMG, ((X_COORD) + 8 * (X_STEP_ROW)), ((Y_COORD) + 8 * (Y_STEP_ROW)))
748
749 #define LOAD_TEXTURE2D_ROW_10(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
750  LOAD_TEXTURE2D_ROW_9(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
751  BASENAME##9 = READ_IMAGE2D(DATA_TYPE, N0, IMG, ((X_COORD) + 9 * (X_STEP_ROW)), ((Y_COORD) + 9 * (Y_STEP_ROW)))
752
753 #define LOAD_TEXTURE2D_ROW_11(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
754  LOAD_TEXTURE2D_ROW_10(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
755  BASENAME##A = READ_IMAGE2D(DATA_TYPE, N0, IMG, ((X_COORD) + 10 * (X_STEP_ROW)), ((Y_COORD) + 10 * (Y_STEP_ROW)))
756
757 #define LOAD_TEXTURE2D_ROW_12(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
758  LOAD_TEXTURE2D_ROW_11(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
759  BASENAME##B = READ_IMAGE2D(DATA_TYPE, N0, IMG, ((X_COORD) + 11 * (X_STEP_ROW)), ((Y_COORD) + 11 * (Y_STEP_ROW)))
760
761 #define LOAD_TEXTURE2D_ROW_13(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
762  LOAD_TEXTURE2D_ROW_12(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
763  BASENAME##C = READ_IMAGE2D(DATA_TYPE, N0, IMG, ((X_COORD) + 12 * (X_STEP_ROW)), ((Y_COORD) + 12 * (Y_STEP_ROW)))
764
765 #define LOAD_TEXTURE2D_ROW_14(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
766  LOAD_TEXTURE2D_ROW_13(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
767  BASENAME##D = READ_IMAGE2D(DATA_TYPE, N0, IMG, ((X_COORD) + 13 * (X_STEP_ROW)), ((Y_COORD) + 13 * (Y_STEP_ROW)))
768
769 #define LOAD_TEXTURE2D_ROW_15(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
770  LOAD_TEXTURE2D_ROW_14(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
771  BASENAME##E = READ_IMAGE2D(DATA_TYPE, N0, IMG, ((X_COORD) + 14 * (X_STEP_ROW)), ((Y_COORD) + 14 * (Y_STEP_ROW)))
772
773 #define LOAD_TEXTURE2D_ROW_16(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
774  LOAD_TEXTURE2D_ROW_15(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) \
775  BASENAME##F = READ_IMAGE2D(DATA_TYPE, N0, IMG, ((X_COORD) + 15 * (X_STEP_ROW)), ((Y_COORD) + 15 * (Y_STEP_ROW)))
776 /** @} */ // end of group LOAD_TEXTURE2D_ROW_n
777 
778 /** Load a 2D texture in unit of pixel. A pixel is made of 4 floating point values
779  * @name LOAD_TEXTURE2D
780  * LOAD_TEXTURE2D forwards to LOAD_TEXTURE2D_STR so that @p M0 is macro-expanded before it is pasted onto LOAD_TEXTURE2D_ROW_.
781  * Supported cases are M0=1,2,3,...,16 and N0=1,2,4
782  * The data to load is expected to have consecutive names for each row.
783  * E.g., for M0=3, and BASENAME=c, the expected data is c0, c1 and c2.
784  *
785  * @param[in] M0 The number of consecutive rows
786  * @param[in] N0 The number of consecutive pixels. Only 1, 2 and 4 are supported
787  * @param[in] DATA_TYPE The data type of the target
788  * @param[in] BASENAME The basename of the result variables
789  * @param[in] IMG The 2D OpenCL image object
790  * @param[in] X_COORD The x coordinate for the top-left pixel
791  * @param[in] Y_COORD The y coordinate for the top-left pixel
792  * @param[in] X_STEP_ROW The incremental step row for the x coordinate (in pixels)
793  * @param[in] Y_STEP_ROW The incremental step row for the y coordinate (in pixels)
794  * @{
795  */
796 #define LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) LOAD_TEXTURE2D_ROW_##M0(N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)
797 #define LOAD_TEXTURE2D(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW) LOAD_TEXTURE2D_STR(M0, N0, DATA_TYPE, BASENAME, IMG, X_COORD, Y_COORD, X_STEP_ROW, Y_STEP_ROW)
798 /** @} */ // end of group LOAD_TEXTURE2D
799 
800 /** Loads the rows from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1) passing the Y index for each row to be loaded.
801  * @name LOAD_ROW_INDIRECT_n
802  * Each macro declares its BASENAMEn variable as VEC_DATA_TYPE(DATA_TYPE, N0); rows 10 to 15 use the suffixes A to F. Row n is loaded from PTR + OFFSET + Yn * STRIDE_Y when Y_MASKn is non-zero and is set to 0 otherwise. PTR, OFFSET and STRIDE_Y are parenthesised in the expansion so that compound expressions can be passed.
803  * @param[in] N0 The number of columns to load
804  * @param[in] DATA_TYPE The data type of variables
805  * @param[in] BASENAME The basename of the destination variables for the loaded rows
806  * @param[in] PTR The base pointer
807  * @param[in] OFFSET The offset within a row
808  * @param[in] STRIDE_Y The stride value in y-axis direction
809  * @param[in] Y The basename of the per-row y-axis offsets (Y0 to Yn-1)
810  * @param[in] Y_MASK The basename of the per-row y-axis masks (Y_MASK0 to Y_MASKn-1). If 0, forces BASENAMEn to 0
811  * @{
812  */
813 #define LOAD_ROW_INDIRECT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
814  VEC_DATA_TYPE(DATA_TYPE, N0) \
815  BASENAME##0; \
816  if(Y_MASK##0 != 0) \
817  BASENAME##0 = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + (OFFSET) + Y##0 * (STRIDE_Y))); \
818  else \
819  BASENAME##0 = 0;
820
821 #define LOAD_ROW_INDIRECT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
822  LOAD_ROW_INDIRECT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
823  VEC_DATA_TYPE(DATA_TYPE, N0) \
824  BASENAME##1; \
825  if(Y_MASK##1 != 0) \
826  BASENAME##1 = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + (OFFSET) + Y##1 * (STRIDE_Y))); \
827  else \
828  BASENAME##1 = 0;
829
830 #define LOAD_ROW_INDIRECT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
831  LOAD_ROW_INDIRECT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
832  VEC_DATA_TYPE(DATA_TYPE, N0) \
833  BASENAME##2; \
834  if(Y_MASK##2 != 0) \
835  BASENAME##2 = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + (OFFSET) + Y##2 * (STRIDE_Y))); \
836  else \
837  BASENAME##2 = 0;
838
839 #define LOAD_ROW_INDIRECT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
840  LOAD_ROW_INDIRECT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
841  VEC_DATA_TYPE(DATA_TYPE, N0) \
842  BASENAME##3; \
843  if(Y_MASK##3 != 0) \
844  BASENAME##3 = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + (OFFSET) + Y##3 * (STRIDE_Y))); \
845  else \
846  BASENAME##3 = 0;
847
848 #define LOAD_ROW_INDIRECT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
849  LOAD_ROW_INDIRECT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
850  VEC_DATA_TYPE(DATA_TYPE, N0) \
851  BASENAME##4; \
852  if(Y_MASK##4 != 0) \
853  BASENAME##4 = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + (OFFSET) + Y##4 * (STRIDE_Y))); \
854  else \
855  BASENAME##4 = 0;
856
857 #define LOAD_ROW_INDIRECT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
858  LOAD_ROW_INDIRECT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
859  VEC_DATA_TYPE(DATA_TYPE, N0) \
860  BASENAME##5; \
861  if(Y_MASK##5 != 0) \
862  BASENAME##5 = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + (OFFSET) + Y##5 * (STRIDE_Y))); \
863  else \
864  BASENAME##5 = 0;
865
866 #define LOAD_ROW_INDIRECT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
867  LOAD_ROW_INDIRECT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
868  VEC_DATA_TYPE(DATA_TYPE, N0) \
869  BASENAME##6; \
870  if(Y_MASK##6 != 0) \
871  BASENAME##6 = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + (OFFSET) + Y##6 * (STRIDE_Y))); \
872  else \
873  BASENAME##6 = 0;
874
875 #define LOAD_ROW_INDIRECT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
876  LOAD_ROW_INDIRECT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
877  VEC_DATA_TYPE(DATA_TYPE, N0) \
878  BASENAME##7; \
879  if(Y_MASK##7 != 0) \
880  BASENAME##7 = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + (OFFSET) + Y##7 * (STRIDE_Y))); \
881  else \
882  BASENAME##7 = 0;
883
884 #define LOAD_ROW_INDIRECT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
885  LOAD_ROW_INDIRECT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
886  VEC_DATA_TYPE(DATA_TYPE, N0) \
887  BASENAME##8; \
888  if(Y_MASK##8 != 0) \
889  BASENAME##8 = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + (OFFSET) + Y##8 * (STRIDE_Y))); \
890  else \
891  BASENAME##8 = 0;
892
893 #define LOAD_ROW_INDIRECT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
894  LOAD_ROW_INDIRECT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
895  VEC_DATA_TYPE(DATA_TYPE, N0) \
896  BASENAME##9; \
897  if(Y_MASK##9 != 0) \
898  BASENAME##9 = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + (OFFSET) + Y##9 * (STRIDE_Y))); \
899  else \
900  BASENAME##9 = 0;
901
902 #define LOAD_ROW_INDIRECT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
903  LOAD_ROW_INDIRECT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
904  VEC_DATA_TYPE(DATA_TYPE, N0) \
905  BASENAME##A; \
906  if(Y_MASK##A != 0) \
907  BASENAME##A = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + (OFFSET) + Y##A * (STRIDE_Y))); \
908  else \
909  BASENAME##A = 0;
910
911 #define LOAD_ROW_INDIRECT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
912  LOAD_ROW_INDIRECT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
913  VEC_DATA_TYPE(DATA_TYPE, N0) \
914  BASENAME##B; \
915  if(Y_MASK##B != 0) \
916  BASENAME##B = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + (OFFSET) + Y##B * (STRIDE_Y))); \
917  else \
918  BASENAME##B = 0;
919
920 #define LOAD_ROW_INDIRECT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
921  LOAD_ROW_INDIRECT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
922  VEC_DATA_TYPE(DATA_TYPE, N0) \
923  BASENAME##C; \
924  if(Y_MASK##C != 0) \
925  BASENAME##C = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + (OFFSET) + Y##C * (STRIDE_Y))); \
926  else \
927  BASENAME##C = 0;
928
929 #define LOAD_ROW_INDIRECT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
930  LOAD_ROW_INDIRECT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
931  VEC_DATA_TYPE(DATA_TYPE, N0) \
932  BASENAME##D; \
933  if(Y_MASK##D != 0) \
934  BASENAME##D = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + (OFFSET) + Y##D * (STRIDE_Y))); \
935  else \
936  BASENAME##D = 0;
937
938 #define LOAD_ROW_INDIRECT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
939  LOAD_ROW_INDIRECT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
940  VEC_DATA_TYPE(DATA_TYPE, N0) \
941  BASENAME##E; \
942  if(Y_MASK##E != 0) \
943  BASENAME##E = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + (OFFSET) + Y##E * (STRIDE_Y))); \
944  else \
945  BASENAME##E = 0;
946
947 #define LOAD_ROW_INDIRECT_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
948  LOAD_ROW_INDIRECT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) \
949  VEC_DATA_TYPE(DATA_TYPE, N0) \
950  BASENAME##F; \
951  if(Y_MASK##F != 0) \
952  BASENAME##F = VLOAD(N0)(0, (__global DATA_TYPE *)((PTR) + (OFFSET) + Y##F * (STRIDE_Y))); \
953  else \
954  BASENAME##F = 0;
955 
956 /** Load blocks (consecutive rows and columns) with Y offset.
957  * @name LOAD_BLOCK_INDIRECT
958  * LOAD_BLOCK_INDIRECT forwards to LOAD_BLOCK_INDIRECT_STR so that @p M0 is macro-expanded before it is pasted onto LOAD_ROW_INDIRECT_.
959  * Supported cases are M0=1,2,3,...,16 and N0=1,2,3,4,8,16
960  * The data to load is expected to have consecutive names for each row.
961  * E.g., for M0=3, and BASENAME=c, the expected data is c0, c1 and c2.
962  * The Y offsets and the Y masks are expected to have consecutive names.
963  * E.g., for M0=3, Y=yi and Y_MASK=ym, the expected names are yi0, yi1, yi2 and ym0, ym1, ym2.
964  *
965  * @param[in] M0 The number of consecutive rows
966  * @param[in] N0 The number of consecutive columns
967  * @param[in] DATA_TYPE The data type of the target
968  * @param[in] BASENAME The basename of the result variables
969  * @param[in] PTR The base pointer for the data
970  * @param[in] OFFSET The offset within a row
971  * @param[in] STRIDE_Y The stride in y-axis direction
972  * @param[in] Y The basename of the per-row y-axis offsets
973  * @param[in] Y_MASK The basename of the per-row y-axis masks. If 0, forces BASENAMEn to 0
974  * @{
975  */
976 #define LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) LOAD_ROW_INDIRECT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)
977 #define LOAD_BLOCK_INDIRECT(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK) LOAD_BLOCK_INDIRECT_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y, Y, Y_MASK)
978 
979 /** Loads the elements from 0 to n-1 in the given variables (BASENAME0 to BASENAMEn-1).
980  * @name LOAD_ELEMENT_n
981  * Each macro declares its BASENAMEn variable as VEC_DATA_TYPE(DATA_TYPE, N0) and initialises it from the single DATA_TYPE value read at PTR + OFFSET + n * STRIDE_Y; rows 10 to 15 use the suffixes A to F. PTR, OFFSET and STRIDE_Y are parenthesised in the expansion so that compound expressions can be passed.
982  * @param[in] N0 The size of each destination vector (one scalar is read per row)
983  * @param[in] DATA_TYPE The data type of variables
984  * @param[in] BASENAME The basename of the destination variables for the loaded rows
985  * @param[in] PTR The base pointer
986  * @param[in] OFFSET The offset within a row
987  * @param[in] STRIDE_Y The stride value in y-axis direction
988  * @{
989  */
990 #define LOAD_ELEMENT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
991  VEC_DATA_TYPE(DATA_TYPE, N0) \
992  BASENAME##0 = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 0 * (STRIDE_Y)));
993
994 #define LOAD_ELEMENT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
995  LOAD_ELEMENT_1(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
996  VEC_DATA_TYPE(DATA_TYPE, N0) \
997  BASENAME##1 = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 1 * (STRIDE_Y)));
998
999 #define LOAD_ELEMENT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
1000  LOAD_ELEMENT_2(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
1001  VEC_DATA_TYPE(DATA_TYPE, N0) \
1002  BASENAME##2 = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 2 * (STRIDE_Y)));
1003
1004 #define LOAD_ELEMENT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
1005  LOAD_ELEMENT_3(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
1006  VEC_DATA_TYPE(DATA_TYPE, N0) \
1007  BASENAME##3 = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 3 * (STRIDE_Y)));
1008
1009 #define LOAD_ELEMENT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
1010  LOAD_ELEMENT_4(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
1011  VEC_DATA_TYPE(DATA_TYPE, N0) \
1012  BASENAME##4 = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 4 * (STRIDE_Y)));
1013
1014 #define LOAD_ELEMENT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
1015  LOAD_ELEMENT_5(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
1016  VEC_DATA_TYPE(DATA_TYPE, N0) \
1017  BASENAME##5 = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 5 * (STRIDE_Y)));
1018
1019 #define LOAD_ELEMENT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
1020  LOAD_ELEMENT_6(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
1021  VEC_DATA_TYPE(DATA_TYPE, N0) \
1022  BASENAME##6 = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 6 * (STRIDE_Y)));
1023
1024 #define LOAD_ELEMENT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
1025  LOAD_ELEMENT_7(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
1026  VEC_DATA_TYPE(DATA_TYPE, N0) \
1027  BASENAME##7 = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 7 * (STRIDE_Y)));
1028
1029 #define LOAD_ELEMENT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
1030  LOAD_ELEMENT_8(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
1031  VEC_DATA_TYPE(DATA_TYPE, N0) \
1032  BASENAME##8 = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 8 * (STRIDE_Y)));
1033
1034 #define LOAD_ELEMENT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
1035  LOAD_ELEMENT_9(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
1036  VEC_DATA_TYPE(DATA_TYPE, N0) \
1037  BASENAME##9 = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 9 * (STRIDE_Y)));
1038
1039 #define LOAD_ELEMENT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
1040  LOAD_ELEMENT_10(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
1041  VEC_DATA_TYPE(DATA_TYPE, N0) \
1042  BASENAME##A = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 10 * (STRIDE_Y)));
1043
1044 #define LOAD_ELEMENT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
1045  LOAD_ELEMENT_11(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
1046  VEC_DATA_TYPE(DATA_TYPE, N0) \
1047  BASENAME##B = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 11 * (STRIDE_Y)));
1048
1049 #define LOAD_ELEMENT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
1050  LOAD_ELEMENT_12(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
1051  VEC_DATA_TYPE(DATA_TYPE, N0) \
1052  BASENAME##C = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 12 * (STRIDE_Y)));
1053
1054 #define LOAD_ELEMENT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
1055  LOAD_ELEMENT_13(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
1056  VEC_DATA_TYPE(DATA_TYPE, N0) \
1057  BASENAME##D = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 13 * (STRIDE_Y)));
1058
1059 #define LOAD_ELEMENT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
1060  LOAD_ELEMENT_14(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
1061  VEC_DATA_TYPE(DATA_TYPE, N0) \
1062  BASENAME##E = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 14 * (STRIDE_Y)));
1063
1064 #define LOAD_ELEMENT_16(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
1065  LOAD_ELEMENT_15(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) \
1066  VEC_DATA_TYPE(DATA_TYPE, N0) \
1067  BASENAME##F = *((__global DATA_TYPE *)((PTR) + (OFFSET) + 15 * (STRIDE_Y)));
1068
1069 /** @}*/ // end of group LOAD_ELEMENT_n
1070 
1071 /** Load Scalar as Vector (consecutive elements).
1072  * @name LOAD_SCALAR_AS_VECTOR
1073  * LOAD_SCALAR_AS_VECTOR forwards to LOAD_SCALAR_AS_VECTOR_STR so that @p M0 is macro-expanded before it is pasted onto LOAD_ELEMENT_. The result variables are declared by the expansion.
1074  * Supported cases are M0=1,2,3,...,16 and N0=1,2,3,4,8,16
1075  * The data to load is expected to have consecutive names for each row.
1076  * E.g., for M0=3, and BASENAME=c, the expected data is c0, c1 and c2.
1077  *
1078  * @param[in] M0 The number of consecutive rows
1079  * @param[in] N0 The number of consecutive columns
1080  * @param[in] DATA_TYPE The data type of the target
1081  * @param[in] BASENAME The basename of the result variables
1082  * @param[in] PTR The base pointer for the data
1083  * @param[in] OFFSET The offset within a row
1084  * @param[in] STRIDE_Y The stride in y-axis direction
1085  * @{
1086  */
1087 #define LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) LOAD_ELEMENT_##M0(N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)
1088 #define LOAD_SCALAR_AS_VECTOR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y) LOAD_SCALAR_AS_VECTOR_STR(M0, N0, DATA_TYPE, BASENAME, PTR, OFFSET, STRIDE_Y)
1089 /** @} */ // end of group LOAD_SCALAR_AS_VECTOR
1090 
1091 /** Basic macros to calculate Z offset values from Z0 to Zn-1
1092  * @name CALCULATE_Z_OFFSET_n
1093  *
1094  * @param[in] M0 The number of offset values to calculate
1095  * @param[in] DATA_TYPE The data type of the results
1096  * @param[in] Z The basename of the result variables
1097  * @param[in] Y The work-itme ID of y-axis
1098  * @param[in] HEIGHT_GEMM3D The height of GEMM3D
1099  * @param[in] DEPTH_GEMM3D The depth of GEMM3D
1100  * @param[in] CROSS_PLANE_PAD The padding required for plane changes accross the z-dimension
1101  * @param[in] STRIDE_Y The stride value in y-axis direction
1102  *
1103  * @{
1104  */
// CALCULATE_Z_OFFSET_n assigns Z0 ... Z(n-1): it first expands CALCULATE_Z_OFFSET_(n-1) and then handles row n-1.
// For row i the three statements compute
//     Zi = min(DEPTH_GEMM3D - 1, (i + Y) / HEIGHT_GEMM3D) * (CROSS_PLANE_PAD * STRIDE_Y)
// The result variables are only assigned, never declared, so they must exist at the expansion site.
// min() is not defined here and has to be provided by the environment the macro is expanded in.
// Every value argument is wrapped in parentheses so that an expression argument (e.g. "a + b") keeps its
// meaning inside the casts, the subtraction and the product.
// M0 is only forwarded down the chain and is not used by any of the statements.
#define CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    Z##0 = (0 + (DATA_TYPE)(Y)) / (DATA_TYPE)(HEIGHT_GEMM3D); \
    Z##0 = min((DATA_TYPE)((DEPTH_GEMM3D) - 1), Z##0); \
    Z##0 *= ((CROSS_PLANE_PAD) * (STRIDE_Y));

#define CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_1(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    Z##1 = (1 + (DATA_TYPE)(Y)) / (DATA_TYPE)(HEIGHT_GEMM3D); \
    Z##1 = min((DATA_TYPE)((DEPTH_GEMM3D) - 1), Z##1); \
    Z##1 *= ((CROSS_PLANE_PAD) * (STRIDE_Y));

#define CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_2(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    Z##2 = (2 + (DATA_TYPE)(Y)) / (DATA_TYPE)(HEIGHT_GEMM3D); \
    Z##2 = min((DATA_TYPE)((DEPTH_GEMM3D) - 1), Z##2); \
    Z##2 *= ((CROSS_PLANE_PAD) * (STRIDE_Y));

#define CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_3(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    Z##3 = (3 + (DATA_TYPE)(Y)) / (DATA_TYPE)(HEIGHT_GEMM3D); \
    Z##3 = min((DATA_TYPE)((DEPTH_GEMM3D) - 1), Z##3); \
    Z##3 *= ((CROSS_PLANE_PAD) * (STRIDE_Y));

#define CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_4(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    Z##4 = (4 + (DATA_TYPE)(Y)) / (DATA_TYPE)(HEIGHT_GEMM3D); \
    Z##4 = min((DATA_TYPE)((DEPTH_GEMM3D) - 1), Z##4); \
    Z##4 *= ((CROSS_PLANE_PAD) * (STRIDE_Y));

#define CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_5(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    Z##5 = (5 + (DATA_TYPE)(Y)) / (DATA_TYPE)(HEIGHT_GEMM3D); \
    Z##5 = min((DATA_TYPE)((DEPTH_GEMM3D) - 1), Z##5); \
    Z##5 *= ((CROSS_PLANE_PAD) * (STRIDE_Y));

#define CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_6(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    Z##6 = (6 + (DATA_TYPE)(Y)) / (DATA_TYPE)(HEIGHT_GEMM3D); \
    Z##6 = min((DATA_TYPE)((DEPTH_GEMM3D) - 1), Z##6); \
    Z##6 *= ((CROSS_PLANE_PAD) * (STRIDE_Y));

#define CALCULATE_Z_OFFSET_8(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_7(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    Z##7 = (7 + (DATA_TYPE)(Y)) / (DATA_TYPE)(HEIGHT_GEMM3D); \
    Z##7 = min((DATA_TYPE)((DEPTH_GEMM3D) - 1), Z##7); \
    Z##7 *= ((CROSS_PLANE_PAD) * (STRIDE_Y));
1151 
1152 /** @} */ // end of group CALCULATE_Z_OFFSET_n
1153 
/** Compute the Z offsets Z0 ... Z(M0-1) for M0 consecutive rows
 * @name CALCULATE_Z_OFFSET
 *
 * The result variables are only assigned, so they must already exist and carry consecutive
 * suffixes starting at 0. E.g., for M0=3 and Z=zin, the variables written are zin0, zin1 and zin2.
 * CROSS_PLANE_PAD (the cross-plane padding) accounts for the padding that may sit between two
 * planes when the rows change plane across the z-dimension.
 *
 * <!--
 * |                  |
 * |      plane0      |
 * |                  |
 * |__________________|
 * |******************|
 * |  cross_plane_pad |
 * |******************|
 * |                  |
 * |      plane1      |
 * |                  |
 * |__________________|
 * -->
 *
 * CALCULATE_Z_OFFSET lets M0 be macro-expanded first; CALCULATE_Z_OFFSET_STR then pastes the expanded
 * value onto the CALCULATE_Z_OFFSET_ prefix to select the matching CALCULATE_Z_OFFSET_n macro.
 *
 * @param[in] M0              The number of offset values to calculate
 * @param[in] DATA_TYPE       The data type of the results
 * @param[in] Z               The basename of the result variables
 * @param[in] Y               The work-item ID of y-axis
 * @param[in] HEIGHT_GEMM3D   The height of GEMM3D
 * @param[in] DEPTH_GEMM3D    The depth of GEMM3D
 * @param[in] CROSS_PLANE_PAD The padding required for plane changes across the z-dimension
 * @param[in] STRIDE_Y        The stride value in y-axis direction
 * @{
 */
#define CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_##M0(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)
#define CALCULATE_Z_OFFSET(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y) \
    CALCULATE_Z_OFFSET_STR(M0, DATA_TYPE, Z, Y, HEIGHT_GEMM3D, DEPTH_GEMM3D, CROSS_PLANE_PAD, STRIDE_Y)
/** @} */ // end of group CALCULATE_Z_OFFSET
1189 
/** Scale the rows in the given variables (BASENAME0 to BASENAMEn-1)
 * @name SCALE_ROW_n
 *
 * SCALE_ROW_n first expands SCALE_ROW_(n-1) and then multiplies one more variable in place.
 * The variables with index 10 to 15 use the suffixes A to F.
 *
 * @param[in] DATA_TYPE The data type of the variables. SCALE is cast to this type before the multiplication
 * @param[in] BASENAME  The basename of the variables
 * @param[in] SCALE     The scale factor. It is parenthesized before the cast, so a compound expression
 *                      (e.g. "a + b") is converted as a whole rather than only its first operand
 * @{
 */
#define SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##0 *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_1(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##1 *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_3(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_2(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##2 *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_4(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_3(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##3 *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_5(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_4(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##4 *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_6(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_5(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##5 *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_7(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_6(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##6 *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_8(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_7(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##7 *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_9(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_8(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##8 *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_10(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_9(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##9 *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_11(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_10(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##A *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_12(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_11(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##B *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_13(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_12(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##C *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_14(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_13(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##D *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_15(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_14(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##E *= (DATA_TYPE)(SCALE);

#define SCALE_ROW_16(DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_15(DATA_TYPE, BASENAME, SCALE) \
    BASENAME##F *= (DATA_TYPE)(SCALE);
/** @} */ // end of group SCALE_ROW_n
1261 
/** Multiply every row of a block (BASENAME) by a scale factor
 * @name SCALE_BLOCK
 *
 * Supported cases are N=1,2,3,...,16
 *
 * SCALE_BLOCK lets N be macro-expanded first; SCALE_BLOCK_STR then pastes the expanded value onto
 * the SCALE_ROW_ prefix to select the matching SCALE_ROW_n macro.
 *
 * @param[in] N         The number of rows in the block
 * @param[in] DATA_TYPE The data type of the block
 * @param[in] BASENAME  The basename of the block
 * @param[in] SCALE     The scale factor
 * @{
 */
#define SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE) \
    SCALE_ROW_##N(DATA_TYPE, BASENAME, SCALE)
#define SCALE_BLOCK(N, DATA_TYPE, BASENAME, SCALE) \
    SCALE_BLOCK_STR(N, DATA_TYPE, BASENAME, SCALE)
/** @} */ // end of group SCALE_BLOCK
1276 
/** Declare a new vector that gathers one component (column) from each of n source vectors
 * @name COLUMN_VECTORn
 *
 * COLUMN_VECTORn declares BASENAME<IDX_COL> and initializes it with component s<IDX_COL> of the
 * source variables X0 ... X(n-1) (suffixes A to F for index 10 to 15). For n=1 the result is a
 * plain TYPE scalar, otherwise it is VEC_DATA_TYPE(TYPE, n).
 * Each macro already ends with a semicolon.
 *
 * @param[in] IDX_COL  The index value: selects the source component and is the suffix of the destination name
 * @param[in] BASENAME The basename of the destination vectors
 * @param[in] X        The basename of the source vectors
 * @param[in] TYPE     The data type of the destination vectors
 * @{
 */
#define COLUMN_VECTOR1(IDX_COL, BASENAME, X, TYPE) \
    TYPE BASENAME##IDX_COL = (TYPE)((X##0).s##IDX_COL);
#define COLUMN_VECTOR2(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 2) \
    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))( \
        (X##0).s##IDX_COL, (X##1).s##IDX_COL);
#define COLUMN_VECTOR3(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 3) \
    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 3))( \
        (X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL);
#define COLUMN_VECTOR4(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 4) \
    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 4))( \
        (X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL);
#define COLUMN_VECTOR8(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 8) \
    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))( \
        (X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, \
        (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL);
#define COLUMN_VECTOR16(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 16) \
    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))( \
        (X##0).s##IDX_COL, (X##1).s##IDX_COL, (X##2).s##IDX_COL, (X##3).s##IDX_COL, \
        (X##4).s##IDX_COL, (X##5).s##IDX_COL, (X##6).s##IDX_COL, (X##7).s##IDX_COL, \
        (X##8).s##IDX_COL, (X##9).s##IDX_COL, (X##A).s##IDX_COL, (X##B).s##IDX_COL, \
        (X##C).s##IDX_COL, (X##D).s##IDX_COL, (X##E).s##IDX_COL, (X##F).s##IDX_COL);
/** @} */ // end of group COLUMN_VECTORn
1304 
/** Declare a new vector built from n scalar source variables. Utility macros for transposing a column-vector
 * @name COLUMN_VECTOR_SCALARn
 *
 * COLUMN_VECTOR_SCALARn declares BASENAME<IDX_COL> and initializes it with the source variables
 * X0 ... X(n-1) taken as they are (suffixes A to F for index 10 to 15). Unlike COLUMN_VECTORn, no
 * component of the sources is selected: IDX_COL only forms the name of the destination.
 * For n=1 the result is a plain TYPE scalar, otherwise it is VEC_DATA_TYPE(TYPE, n).
 * Each macro already ends with a semicolon.
 *
 * @param[in] IDX_COL  The index value (suffix of the destination name)
 * @param[in] BASENAME The basename of the destination vectors
 * @param[in] X        The basename of the source vectors
 * @param[in] TYPE     The data type of the destination vectors
 * @{
 */
#define COLUMN_VECTOR_SCALAR1(IDX_COL, BASENAME, X, TYPE) \
    TYPE BASENAME##IDX_COL = (TYPE)((X##0));
#define COLUMN_VECTOR_SCALAR2(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 2) \
    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 2))( \
        (X##0), (X##1));
#define COLUMN_VECTOR_SCALAR3(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 3) \
    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 3))( \
        (X##0), (X##1), (X##2));
#define COLUMN_VECTOR_SCALAR4(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 4) \
    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 4))( \
        (X##0), (X##1), (X##2), (X##3));
#define COLUMN_VECTOR_SCALAR8(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 8) \
    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 8))( \
        (X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7));
#define COLUMN_VECTOR_SCALAR16(IDX_COL, BASENAME, X, TYPE) \
    VEC_DATA_TYPE(TYPE, 16) \
    BASENAME##IDX_COL = (VEC_DATA_TYPE(TYPE, 16))( \
        (X##0), (X##1), (X##2), (X##3), (X##4), (X##5), (X##6), (X##7), \
        (X##8), (X##9), (X##A), (X##B), (X##C), (X##D), (X##E), (X##F));
/** @} */ // end of group COLUMN_VECTOR_SCALARn
1332 
/** Create transposed vectors of the given vectors
 * @name TRANSPOSE_K0Xn
 *
 * TRANSPOSE_K0Xn emits one COLUMN_VECTOR expansion per column index 0 ... n-1 (indices 10 to 15 are
 * written A to F), i.e. it declares the transposed vectors BASENAME0 ... BASENAMEn-1. The larger
 * variants first expand the next smaller one (16 -> 8 -> 4 -> 3 -> 2) and then add their own columns.
 * TRANSPOSE_K0X1 is the exception: it uses COLUMN_VECTOR_SCALAR, which reads the sources as they are
 * instead of selecting a component.
 *
 * @param[in] K0       Forwarded to COLUMN_VECTOR / COLUMN_VECTOR_SCALAR, where it selects the variant that
 *                     reads the K0 source variables BS0 ... BS(K0-1); it is therefore also the size of each
 *                     transposed vector
 * @param[in] BASENAME The basename of transposed vectors
 * @param[in] BS       The basename of source vectors for transposition
 * @param[in] TYPE     The data type of the transposed vectors
 * @{
 */
#define TRANSPOSE_K0X1(K0, BASENAME, BS, TYPE) \
    COLUMN_VECTOR_SCALAR(K0, 0, BASENAME, BS, TYPE);
#define TRANSPOSE_K0X2(K0, BASENAME, BS, TYPE) \
    COLUMN_VECTOR(K0, 0, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, 1, BASENAME, BS, TYPE);
#define TRANSPOSE_K0X3(K0, BASENAME, BS, TYPE) \
    TRANSPOSE_K0X2(K0, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, 2, BASENAME, BS, TYPE);
#define TRANSPOSE_K0X4(K0, BASENAME, BS, TYPE) \
    TRANSPOSE_K0X3(K0, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, 3, BASENAME, BS, TYPE);
#define TRANSPOSE_K0X8(K0, BASENAME, BS, TYPE) \
    TRANSPOSE_K0X4(K0, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, 4, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, 5, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, 6, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, 7, BASENAME, BS, TYPE);
#define TRANSPOSE_K0X16(K0, BASENAME, BS, TYPE) \
    TRANSPOSE_K0X8(K0, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, 8, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, 9, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, A, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, B, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, C, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, D, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, E, BASENAME, BS, TYPE); \
    COLUMN_VECTOR(K0, F, BASENAME, BS, TYPE);

/** @} */ // end of group TRANSPOSE_K0Xn
1371 
/** Create a column vector holding the values at the given index of a set of source vectors
 *
 * Dispatches to COLUMN_VECTOR<K0> by gluing K0 onto the COLUMN_VECTOR prefix with CONCAT (CONCAT is
 * defined outside this section). The expansion is followed by an additional semicolon.
 *
 * @param[in] K0       The number of source vectors
 * @param[in] IDX_COL  The index value
 * @param[in] BASENAME The basename of the destination vectors
 * @param[in] BS       The basename of the source vectors
 * @param[in] TYPE     The data type of the destination vectors
 */
#define COLUMN_VECTOR(K0, IDX_COL, BASENAME, BS, TYPE) CONCAT(COLUMN_VECTOR, K0)(IDX_COL, BASENAME, BS, TYPE);
1383 
/** Create a column vector from a set of scalar sources. Utility macro for transposing a column-vector
 *
 * Dispatches to COLUMN_VECTOR_SCALAR<K0> by gluing K0 onto the COLUMN_VECTOR_SCALAR prefix with CONCAT
 * (CONCAT is defined outside this section). The expansion is followed by an additional semicolon.
 *
 * @param[in] K0       The number of source vectors
 * @param[in] IDX_COL  The index value
 * @param[in] BASENAME The basename of the destination vectors
 * @param[in] BS       The basename of the source vectors
 * @param[in] TYPE     The data type of the destination vectors
 */
#define COLUMN_VECTOR_SCALAR(K0, IDX_COL, BASENAME, BS, TYPE) CONCAT(COLUMN_VECTOR_SCALAR, K0)(IDX_COL, BASENAME, BS, TYPE);
1395 
/** Create transposed vectors from the given source vectors
 *
 * Dispatches to TRANSPOSE_K0X<N0> by gluing N0 onto the TRANSPOSE_K0X prefix with CONCAT (CONCAT is
 * defined outside this section). The expansion is followed by an additional semicolon.
 *
 * @param[in] K0       Forwarded unchanged to TRANSPOSE_K0X<N0> (there: the number of source variables read)
 * @param[in] N0       Selects the TRANSPOSE_K0X<N0> variant (the number of transposed vectors created)
 * @param[in] BASENAME The basename of transposed vectors
 * @param[in] BS       The basename of source vectors for transposition
 * @param[in] TYPE     The data type of the transposed vectors
 *
 */
#define TRANSPOSE_K0XN0(K0, N0, BASENAME, BS, TYPE) CONCAT(TRANSPOSE_K0X, N0)(K0, BASENAME, BS, TYPE);
1408 
/** Add the variables BIAS0 ... BIASn-1 to the variables BASENAME0 ... BASENAMEn-1
 * @name ADD_ROW_n
 *
 * ADD_ROW_n first expands ADD_ROW_(n-1) and then accumulates one more pair in place
 * (BASENAMEi += BIASi). The variables with index 10 to 15 use the suffixes A to F.
 *
 * @param[in] BASENAME The basename of the destination variables
 * @param[in] BIAS     The basename of the added variables
 * @{
 */
#define ADD_ROW_1(BASENAME, BIAS) \
    BASENAME##0 += BIAS##0;

#define ADD_ROW_2(BASENAME, BIAS) \
    ADD_ROW_1(BASENAME, BIAS) \
    BASENAME##1 += BIAS##1;

#define ADD_ROW_3(BASENAME, BIAS) \
    ADD_ROW_2(BASENAME, BIAS) \
    BASENAME##2 += BIAS##2;

#define ADD_ROW_4(BASENAME, BIAS) \
    ADD_ROW_3(BASENAME, BIAS) \
    BASENAME##3 += BIAS##3;

#define ADD_ROW_5(BASENAME, BIAS) \
    ADD_ROW_4(BASENAME, BIAS) \
    BASENAME##4 += BIAS##4;

#define ADD_ROW_6(BASENAME, BIAS) \
    ADD_ROW_5(BASENAME, BIAS) \
    BASENAME##5 += BIAS##5;

#define ADD_ROW_7(BASENAME, BIAS) \
    ADD_ROW_6(BASENAME, BIAS) \
    BASENAME##6 += BIAS##6;

#define ADD_ROW_8(BASENAME, BIAS) \
    ADD_ROW_7(BASENAME, BIAS) \
    BASENAME##7 += BIAS##7;

#define ADD_ROW_9(BASENAME, BIAS) \
    ADD_ROW_8(BASENAME, BIAS) \
    BASENAME##8 += BIAS##8;

#define ADD_ROW_10(BASENAME, BIAS) \
    ADD_ROW_9(BASENAME, BIAS) \
    BASENAME##9 += BIAS##9;

#define ADD_ROW_11(BASENAME, BIAS) \
    ADD_ROW_10(BASENAME, BIAS) \
    BASENAME##A += BIAS##A;

#define ADD_ROW_12(BASENAME, BIAS) \
    ADD_ROW_11(BASENAME, BIAS) \
    BASENAME##B += BIAS##B;

#define ADD_ROW_13(BASENAME, BIAS) \
    ADD_ROW_12(BASENAME, BIAS) \
    BASENAME##C += BIAS##C;

#define ADD_ROW_14(BASENAME, BIAS) \
    ADD_ROW_13(BASENAME, BIAS) \
    BASENAME##D += BIAS##D;

#define ADD_ROW_15(BASENAME, BIAS) \
    ADD_ROW_14(BASENAME, BIAS) \
    BASENAME##E += BIAS##E;

#define ADD_ROW_16(BASENAME, BIAS) \
    ADD_ROW_15(BASENAME, BIAS) \
    BASENAME##F += BIAS##F;

/** @} */ // end of group ADD_ROW_n
1480 
/** Add one block (BIAS) to another block (BASENAME), row by row
 * @name ADD_BLOCK
 *
 * Supported cases are N=1,2,3,...,16
 *
 * ADD_BLOCK lets N be macro-expanded first; ADD_BLOCK_STR then pastes the expanded value onto the
 * ADD_ROW_ prefix to select the matching ADD_ROW_n macro.
 *
 * @param[in] N        The number of vectors in the block
 * @param[in] BASENAME The basename of the destination variables
 * @param[in] BIAS     The basename of the added variables
 * @{
 */
#define ADD_BLOCK_STR(N, BASENAME, BIAS) \
    ADD_ROW_##N(BASENAME, BIAS)
#define ADD_BLOCK(N, BASENAME, BIAS) \
    ADD_BLOCK_STR(N, BASENAME, BIAS)
/** @} */ // end of group ADD_BLOCK
1494 
/** Broadcast (add) a single value to each of the destination variables BASENAME0 ... BASENAMEn-1
 * @name ADD_ROW_BROADCAST_n
 *
 * ADD_ROW_BROADCAST_n first expands ADD_ROW_BROADCAST_(n-1) and then adds BIAS to one more variable
 * in place. The same BIAS is used for every row. The variables with index 10 to 15 use the
 * suffixes A to F.
 *
 * @param[in] BASENAME The basename of the destination variables
 * @param[in] BIAS     The variable containing the value to add
 * @{
 */
#define ADD_ROW_BROADCAST_1(BASENAME, BIAS) \
    BASENAME##0 += BIAS;

#define ADD_ROW_BROADCAST_2(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_1(BASENAME, BIAS) \
    BASENAME##1 += BIAS;

#define ADD_ROW_BROADCAST_3(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_2(BASENAME, BIAS) \
    BASENAME##2 += BIAS;

#define ADD_ROW_BROADCAST_4(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_3(BASENAME, BIAS) \
    BASENAME##3 += BIAS;

#define ADD_ROW_BROADCAST_5(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_4(BASENAME, BIAS) \
    BASENAME##4 += BIAS;

#define ADD_ROW_BROADCAST_6(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_5(BASENAME, BIAS) \
    BASENAME##5 += BIAS;

#define ADD_ROW_BROADCAST_7(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_6(BASENAME, BIAS) \
    BASENAME##6 += BIAS;

#define ADD_ROW_BROADCAST_8(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_7(BASENAME, BIAS) \
    BASENAME##7 += BIAS;

#define ADD_ROW_BROADCAST_9(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_8(BASENAME, BIAS) \
    BASENAME##8 += BIAS;

#define ADD_ROW_BROADCAST_10(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_9(BASENAME, BIAS) \
    BASENAME##9 += BIAS;

#define ADD_ROW_BROADCAST_11(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_10(BASENAME, BIAS) \
    BASENAME##A += BIAS;

#define ADD_ROW_BROADCAST_12(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_11(BASENAME, BIAS) \
    BASENAME##B += BIAS;

#define ADD_ROW_BROADCAST_13(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_12(BASENAME, BIAS) \
    BASENAME##C += BIAS;

#define ADD_ROW_BROADCAST_14(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_13(BASENAME, BIAS) \
    BASENAME##D += BIAS;

#define ADD_ROW_BROADCAST_15(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_14(BASENAME, BIAS) \
    BASENAME##E += BIAS;

#define ADD_ROW_BROADCAST_16(BASENAME, BIAS) \
    ADD_ROW_BROADCAST_15(BASENAME, BIAS) \
    BASENAME##F += BIAS;
/** @} */ // end of group ADD_ROW_BROADCAST_n
1564 
/** Broadcast (add) a value to every row of the destination block (BASENAME)
 * @name ADD_BLOCK_BROADCAST
 *
 * Supported cases are N=1,2,3,...,16.
 *
 * ADD_BLOCK_BROADCAST lets N be macro-expanded first; ADD_BLOCK_BROADCAST_STR then pastes the
 * expanded value onto the ADD_ROW_BROADCAST_ prefix to select the matching ADD_ROW_BROADCAST_n macro.
 *
 * @param[in] N        The number of vectors in the block
 * @param[in] BASENAME The basename of the destination variables
 * @param[in] BIAS     The variable containing the value to add
 * @{
 */
#define ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS) \
    ADD_ROW_BROADCAST_##N(BASENAME, BIAS)
#define ADD_BLOCK_BROADCAST(N, BASENAME, BIAS) \
    ADD_BLOCK_BROADCAST_STR(N, BASENAME, BIAS)
/** @} */ // end of group ADD_BLOCK_BROADCAST
1578 
/** Apply an activation to the variables BASENAME0 ... BASENAMEn-1
 * @name ACTIVATION_ROW_n
 *
 * ACTIVATION_ROW_n first expands ACTIVATION_ROW_(n-1) and then overwrites one more variable with the
 * result of ACTIVATION(...) applied to it. ACTIVATION is defined outside this section; all arguments
 * are forwarded to it unchanged. The variables with index 10 to 15 use the suffixes A to F.
 *
 * @param[in] ACTIVATION_TYPE The type of the activation
 * @param[in] DATA_TYPE       The data type of the vectors
 * @param[in] VEC_SIZE        Forwarded to ACTIVATION (presumably the size of the vectors - confirm against ACTIVATION)
 * @param[in] BASENAME        The basename of the variables
 * @param[in] A_VAL           Additional value required by the activation
 * @param[in] B_VAL           Additional value required by the activation
 * @{
 */
#define ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##0, A_VAL, B_VAL);

#define ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_1(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##1 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##1, A_VAL, B_VAL);

#define ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_2(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##2 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##2, A_VAL, B_VAL);

#define ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_3(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##3 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##3, A_VAL, B_VAL);

#define ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_4(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##4 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##4, A_VAL, B_VAL);

#define ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_5(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##5 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##5, A_VAL, B_VAL);

#define ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_6(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##6 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##6, A_VAL, B_VAL);

#define ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_7(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##7 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##7, A_VAL, B_VAL);

#define ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_8(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##8 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##8, A_VAL, B_VAL);

#define ACTIVATION_ROW_10(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_9(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##9 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##9, A_VAL, B_VAL);

#define ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_10(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##A = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##A, A_VAL, B_VAL);

#define ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_11(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##B = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##B, A_VAL, B_VAL);

#define ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_12(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##C = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##C, A_VAL, B_VAL);

#define ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_13(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##D = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##D, A_VAL, B_VAL);

#define ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_14(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##E = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##E, A_VAL, B_VAL);

#define ACTIVATION_ROW_16(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_15(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    BASENAME##F = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME##F, A_VAL, B_VAL);
/** @} */ // end of group ACTIVATION_ROW_n
1652 
/** Apply an activation to every row of a block (BASENAME)
 * @name ACTIVATION_BLOCK
 *
 * Supported cases are N=1,2,3,...,16.
 *
 * ACTIVATION_BLOCK lets N be macro-expanded first; ACTIVATION_BLOCK_STR then pastes the expanded
 * value onto the ACTIVATION_ROW_ prefix to select the matching ACTIVATION_ROW_n macro.
 *
 * @param[in] N               The number of vectors in the block
 * @param[in] ACTIVATION_TYPE The type of the activation
 * @param[in] DATA_TYPE       The data type of the vectors
 * @param[in] VEC_SIZE        Forwarded unchanged to ACTIVATION_ROW_n
 * @param[in] BASENAME        The basename of the variables
 * @param[in] A_VAL           Additional value required by the activation
 * @param[in] B_VAL           Additional value required by the activation
 * @{
 */
#define ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_ROW_##N(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)
#define ACTIVATION_BLOCK(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL) \
    ACTIVATION_BLOCK_STR(N, ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, BASENAME, A_VAL, B_VAL)
/** @} */ // end of group ACTIVATION_BLOCK
1669 
/** Apply convert_<data_type> to the given variables
 * @name CONVERT_ROW_n
 *
 * CONVERT_ROW_n first expands CONVERT_ROW_(n-1) and then declares one more destination variable
 * BASENAME_DSTi of type VEC_DATA_TYPE(DATA_TYPE, N), initialized with CONVERT applied to BASENAME_SRCi.
 * The destination variables are newly declared by these macros. VEC_DATA_TYPE and CONVERT are
 * defined outside this section. The variables with index 10 to 15 use the suffixes A to F.
 *
 * @param[in] N            The size of the vectors
 * @param[in] DATA_TYPE    The data type of the vectors
 * @param[in] BASENAME_SRC The basename of the source variables
 * @param[in] BASENAME_DST The basename of the destination variables
 * @{
 */
#define CONVERT_ROW_1(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##0 = CONVERT(BASENAME_SRC##0, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_2(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_1(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##1 = CONVERT(BASENAME_SRC##1, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_3(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_2(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##2 = CONVERT(BASENAME_SRC##2, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_4(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_3(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##3 = CONVERT(BASENAME_SRC##3, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_5(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_4(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##4 = CONVERT(BASENAME_SRC##4, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_6(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_5(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##5 = CONVERT(BASENAME_SRC##5, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_7(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_6(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##6 = CONVERT(BASENAME_SRC##6, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_8(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_7(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##7 = CONVERT(BASENAME_SRC##7, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_9(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_8(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##8 = CONVERT(BASENAME_SRC##8, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_10(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_9(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##9 = CONVERT(BASENAME_SRC##9, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_11(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_10(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##A = CONVERT(BASENAME_SRC##A, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_12(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_11(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##B = CONVERT(BASENAME_SRC##B, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_13(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_12(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##C = CONVERT(BASENAME_SRC##C, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_14(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_13(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##D = CONVERT(BASENAME_SRC##D, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_15(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_14(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##E = CONVERT(BASENAME_SRC##E, VEC_DATA_TYPE(DATA_TYPE, N));

#define CONVERT_ROW_16(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_15(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    VEC_DATA_TYPE(DATA_TYPE, N) \
    BASENAME_DST##F = CONVERT(BASENAME_SRC##F, VEC_DATA_TYPE(DATA_TYPE, N));
/** @} */ // end of group CONVERT_ROW_n
1757 
/** Apply convert_<data_type> to a block (BASENAME_SRC) and save the result to another block (BASENAME_DST)
 * @name CONVERT_BLOCK
 *
 * Supported cases N=1,2,3,...,16.
 *
 * CONVERT_BLOCK lets M be macro-expanded first; CONVERT_BLOCK_STR then pastes the expanded value onto
 * the CONVERT_ROW_ prefix to select the matching CONVERT_ROW_n macro.
 *
 * @param[in] M            The number of vectors to convert
 * @param[in] N            The size of the vectors
 * @param[in] DATA_TYPE    The data type of the vectors
 * @param[in] BASENAME_SRC The basename of the source variables
 * @param[in] BASENAME_DST The basename of the destination variables
 * @{
 */
#define CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_ROW_##M(N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
#define CONVERT_BLOCK(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST) \
    CONVERT_BLOCK_STR(M, N, DATA_TYPE, BASENAME_SRC, BASENAME_DST)
/** @} */ // end of group CONVERT_BLOCK