Compute Library
 21.02
reduction_operation.cl
Go to the documentation of this file.
1 /*
2  * Copyright (c) 2016-2020 Arm Limited.
3  *
4  * SPDX-License-Identifier: MIT
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a copy
7  * of this software and associated documentation files (the "Software"), to
8  * deal in the Software without restriction, including without limitation the
9  * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
10  * sell copies of the Software, and to permit persons to whom the Software is
11  * furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in all
14  * copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 #include "helpers.h"
25 #include "helpers_asymm.h"
26 
27 #if defined(FLOAT_DATA_TYPE)
28 #define ISGREATER(x, y) isgreater(x, y)
29 #define ISLESS(x, y) isless(x, y)
30 #else // !FLOAT_DATA_TYPE
31 #if defined(WIDTH)
32 #define ISGREATER(x, y) (x > y) ? 1 : 0
33 #define ISLESS(x, y) (x < y) ? 1 : 0
34 #else // !defined(WIDTH)
35 #define ISGREATER(x, y) select((int16)0, (int16)-1, x > y)
36 #define ISLESS(x, y) select((int16)0, (int16)-1, x < y)
37 #endif // defined(WIDTH)
38 #endif // defined(FLOAT_DATA_TYPE)
39 
40 /** Calculate square sum of a vector
41  *
42  * @param[in] input Pointer to the first pixel.
43  *
44  * @return square sum of vector.
45  */
46 inline DATA_TYPE square_sum(__global const DATA_TYPE *input)
47 {
49  in = vload16(0, input);
50 
51  in *= in;
52 
53  in.s01234567 += in.s89ABCDEF;
54  in.s0123 += in.s4567;
55  in.s01 += in.s23;
56 
57  return (in.s0 + in.s1);
58 }
59 
60 /** Calculate sum of a vector
61  *
62  * @param[in] input Pointer to the first pixel.
63  *
64  * @return sum of vector.
65  */
66 inline DATA_TYPE sum(__global const DATA_TYPE *input)
67 {
69  in = vload16(0, input);
70 
71  in.s01234567 += in.s89ABCDEF;
72  in.s0123 += in.s4567;
73  in.s01 += in.s23;
74 
75  return (in.s0 + in.s1);
76 }
77 
78 /** Calculate product of a vector
79  *
80  * @param[in] input Pointer to the first pixel.
81  *
82  * @return product of vector.
83  */
84 inline DATA_TYPE product(__global const DATA_TYPE *input)
85 {
87  in = vload16(0, input);
88 
89  in.s01234567 *= in.s89ABCDEF;
90  in.s0123 *= in.s4567;
91  in.s01 *= in.s23;
92 
93  return (in.s0 * in.s1);
94 }
95 #if defined(OPERATION)
96 /** This kernel performs parallel reduction given an operation on x-axis.
97  *
98  * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
99  * @note The operation we want to perform must be passed at compile time using -DOPERATION e.g. -DOPERATION=square_sum
100  * @note The mean flag must be passed at compile time using -DMEAN if we want to compute the mean value
101  * @note The product flag must be passed at compile time using -DPROD if we want to compute the product, otherwise sum will be used
102  * @note The width size must be passed at compile time using -DWIDTH e.g. -DWIDTH=128 if we want to compute the mean value
103  *
104  * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32
105  * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
106  * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
107  * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
108  * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
109  * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
110  * @param[in] partial_res_ptr The local buffer to hold partial result values. Supported data types: same as @p src_ptr
111  * @param[in] partial_res_stride_x Stride of the output tensor in X dimension (in bytes)
112  * @param[in] partial_res_step_x partial_res_stride_x * number of elements along X processed per workitem(in bytes)
113  * @param[in] partial_res_stride_y Stride of the output tensor in Y dimension (in bytes)
114  * @param[in] partial_res_step_y partial_res_stride_y * number of elements along Y processed per workitem(in bytes)
115  * @param[in] partial_res_offset_first_element_in_bytes The offset of the first element in the source tensor
116  * @param[in] local_results Local buffer for storing the partial result
117  */
118 __kernel void reduction_operation_x(
120  IMAGE_DECLARATION(partial_res),
121  __local DATA_TYPE *local_results)
122 {
124  Image partial_res = CONVERT_TO_IMAGE_STRUCT(partial_res);
125 
126  unsigned int lsize = get_local_size(0);
127  unsigned int lid = get_local_id(0);
128 
129  for(unsigned int y = 0; y < get_local_size(1); ++y)
130  {
131  local_results[lid] = OPERATION((__global DATA_TYPE *)offset(&src, 0, y));
132  barrier(CLK_LOCAL_MEM_FENCE);
133 
134  // Perform parallel reduction
135  for(unsigned int i = lsize >> 1; i > 0; i >>= 1)
136  {
137  if(lid < i)
138  {
139 #if defined(PROD)
140  local_results[lid] *= local_results[lid + i];
141 #else // !defined(PROD)
142  local_results[lid] += local_results[lid + i];
143 #endif // defined(PROD)
144  }
145  barrier(CLK_LOCAL_MEM_FENCE);
146  }
147 
148  if(lid == 0)
149  {
150 #if defined(MEAN) && defined(WIDTH)
151  if(y == get_local_size(1) - 1)
152  {
153  local_results[0] /= WIDTH;
154  }
155 #endif // defined(MEAN) && defined(WIDTH)
156  ((__global DATA_TYPE *)offset(&partial_res, get_group_id(0), y))[0] = local_results[0];
157  }
158  }
159 }
160 #endif // defined(OPERATION)
161 
162 #if defined(WIDTH)
163 /** This kernel performs reduction on x-axis. (Non parallel)
164  *
165  * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
166  * @note The width size must be passed at compile time using -DWIDTH e.g. -DWIDTH=128
167  * @note The product flag must be passed at compile time using -DPROD if we want to compute the product, otherwise sum will be used
168  * @note In case of MIN and MAX the condition data type must be passed at compile time using -DCOND_DATA_TYPE e.g. -DCOND_DATA_TYPE=short
169  *
170  * @param[in] src_ptr Pointer to the source tensor. Supported data types: S32/F16/F32 and QASYMM8/QASYMM8_SIGNED for operation MEAN
171  * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
172  * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
173  * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
174  * @param[in] output_ptr The local buffer to hold sumed values. Supported data types: same as @p src_ptr
175  * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
176  * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
177  * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source tensor
178  */
179 __kernel void reduction_operation_non_parallel_x(
181  VECTOR_DECLARATION(output))
182 {
184  Vector output = CONVERT_TO_VECTOR_STRUCT(output);
185 
186  DATA_TYPE_PROMOTED res = CONVERT(*((__global DATA_TYPE *)vector_offset(&src, 0)), DATA_TYPE_PROMOTED);
187 
188  // Convert input into F32 in order to perform quantized multiplication
189 #if defined(PROD) && defined(OFFSET) && defined(SCALE)
190  float res_f = DEQUANTIZE(res, OFFSET, SCALE, DATA_TYPE_PROMOTED, 1);
191 #endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
192 
193  for(unsigned int x = 1; x < WIDTH; ++x)
194  {
195  DATA_TYPE_PROMOTED in = CONVERT(*((__global DATA_TYPE *)vector_offset(&src, x)), DATA_TYPE_PROMOTED);
196 #if defined(MIN)
197  res = select(res, in, CONVERT(ISLESS(in, res), COND_DATA_TYPE));
198 #elif defined(MAX)
199  res = select(res, in, CONVERT(ISGREATER(in, res), COND_DATA_TYPE));
200 #elif defined(PROD)
201 #if defined(OFFSET) && defined(SCALE)
202  res_f *= DEQUANTIZE(in, OFFSET, SCALE, DATA_TYPE_PROMOTED, 1);
203 #else // !(defined(OFFSET) && defined(SCALE))
204  res *= in;
205 #endif // defined(OFFSET) && defined(SCALE)
206 #else // defined(SUM))
207  res += in;
208 #endif // defined(MAX) || defined(MIN) || defined(PROD)
209  }
210 
211  // Store result
212 #if defined(MEAN)
213  res /= WIDTH;
214 #endif // defined(MEAN)
215 
216  // Subtract the offsets in case of quantized SUM
217 #if defined(SUM) && defined(OFFSET) && defined(SCALE)
218  res -= (WIDTH - 1) * OFFSET;
219 #endif // defined(OFFSET) && defined(OFFSET) && defined(SCALE)
220 
221  // Re-quantize
222 #if defined(PROD) && defined(OFFSET) && defined(SCALE)
223  res = QUANTIZE(res_f, OFFSET, SCALE, DATA_TYPE_PROMOTED, 1);
224 #endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
225 
226  *((__global DATA_TYPE *)output.ptr) = CONVERT_SAT(res, DATA_TYPE);
227 }
228 #endif // defined(WIDTH)
229 
230 #if defined(HEIGHT)
231 /** This kernel performs reduction on y-axis.
232  *
233  * @note The input data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
234  * @note The height size must be passed at compile time using -DHEIGHT e.g. -DHEIGHT=128
235  *
236  * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
237  * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
238  * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
239  * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
240  * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
241  * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
242  * @param[in] output_ptr The local buffer to hold sumed values. Supported data types: same as @p src_ptr
243  * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
244  * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
245  * @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes)
246  * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
247  * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source tensor
248  */
249 __kernel void reduction_operation_y(
251  IMAGE_DECLARATION(output))
252 {
254  Image output = CONVERT_TO_IMAGE_STRUCT(output);
255 
256  VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
257  res = CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, 0)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
258 
259  // Convert input into F32 in order to perform quantized multiplication
260 #if defined(PROD) && defined(OFFSET) && defined(SCALE)
261  float16 res_f = DEQUANTIZE(res, OFFSET, SCALE, DATA_TYPE_PROMOTED, 16);
262 #endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
263 
264 #if defined(SUM_SQUARE)
265  res *= res;
266 #endif // defined(SUM_SQUARE)
267 
268  for(unsigned int y = 1; y < HEIGHT; ++y)
269  {
270  VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
271  in = CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, y)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
272 #if defined(MIN)
273  res = select(res, in, ISLESS(in, res));
274 #elif defined(MAX)
275  res = select(res, in, ISGREATER(in, res));
276 #else // !(defined(MAX) || defined(MIN))
277 #if defined(SUM_SQUARE)
278  in *= in;
279 #endif // defined(SUM_SQUARE)
280 #if defined(PROD)
281 
282 #if defined(OFFSET) && defined(SCALE)
283  res_f *= DEQUANTIZE(in, OFFSET, SCALE, DATA_TYPE_PROMOTED, 16);
284 #else // !(defined(OFFSET) && defined(SCALE))
285  res *= in;
286 #endif // defined(OFFSET) && defined(SCALE)
287 
288 #else // !defined(PROD)
289  res += in;
290 #endif // defined(PROD)
291 #endif // defined(MAX) || defined(MIN)
292  }
293 
294 #if defined(MEAN)
295  res /= HEIGHT;
296 #endif // defined(MEAN)
297 
298  // Subtract the offsets in case of quantized SUM
299 #if defined(SUM) && defined(OFFSET) && defined(SCALE)
300  res -= (HEIGHT - 1) * OFFSET;
301 #endif // defined(OFFSET) && defined(OFFSET) && defined(SCALE)
302 
303  // Re-quantize
304 #if defined(PROD) && defined(OFFSET) && defined(SCALE)
305  res = QUANTIZE(res_f, OFFSET, SCALE, DATA_TYPE_PROMOTED, 16);
306 #endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
307 
308  // Store result
309  vstore16(CONVERT_SAT(res, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)output.ptr);
310 }
311 #endif // defined(HEIGHT)
312 
313 #if defined(DEPTH)
314 /** This kernel performs reduction on z-axis.
315  *
316  * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
317  * @note The depth size must be passed at compile time using -DDEPTH e.g. -DDEPTH=128
318  *
319  * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
320  * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
321  * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
322  * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
323  * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
324  * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
325  * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
326  * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
327  * @param[in] output_ptr The local buffer to hold sumed values. Supported data types: same as @p input_ptr
328  * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
329  * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
330  * @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes)
331  * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
332  * @param[in] output_stride_z Stride of the output tensor in Z dimension (in bytes)
333  * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
334  * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source tensor
335  */
336 __kernel void reduction_operation_z(
338  TENSOR3D_DECLARATION(output))
339 {
341  Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
342 
343  VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
344  res = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
345 
346  // Convert input into F32 in order to perform quantized multiplication
347 #if defined(PROD) && defined(OFFSET) && defined(SCALE)
348  float16 res_f = DEQUANTIZE(res, OFFSET, SCALE, DATA_TYPE_PROMOTED, 16);
349 #endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
350 
351 #if defined(COMPLEX)
352  VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
353  res1 = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 8, 0, 0)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
354 #endif // defined(COMPLEX)
355 #if defined(SUM_SQUARE)
356  res *= res;
357 #endif // defined(SUM_SQUARE)
358 
359  for(unsigned int z = 1; z < DEPTH; ++z)
360  {
361  VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
362  in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, z)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
363 
364 #if defined(COMPLEX)
365  VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
366  in1 = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 8, 0, z)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
367 #endif // defined(COMPLEX)
368 
369 #if defined(MIN)
370  res = select(res, in, ISLESS(in, res));
371 #elif defined(MAX)
372  res = select(res, in, ISGREATER(in, res));
373 #else // !(defined(MAX) || defined(MIN))
374 #if defined(SUM_SQUARE)
375  in *= in;
376 #endif // defined(SUM_SQUARE)
377 #if defined(PROD)
378 
379 #if defined(OFFSET) && defined(SCALE)
380  res_f *= DEQUANTIZE(in, OFFSET, SCALE, DATA_TYPE_PROMOTED, 16);
381 #else // !(defined(OFFSET) && defined(SCALE))
382  res *= in;
383 #endif // defined(OFFSET) && defined(SCALE)
384 
385 #else // !defined(PROD)
386  res += in;
387 #if defined(COMPLEX)
388  res1 += in1;
389 #endif // defined(COMPLEX)
390 #endif // defined(PROD)
391 #endif // defined(MAX) || defined(MIN)
392  }
393 
394 #if defined(MEAN)
395  res /= DEPTH;
396 #endif // defined(MEAN)
397 
398  // Subtract the offsets in case of quantized SUM
399 #if defined(SUM) && defined(OFFSET) && defined(SCALE)
400  res -= (DEPTH - 1) * OFFSET;
401 #endif // defined(OFFSET) && defined(OFFSET) && defined(SCALE)
402 
403  // Re-quantize
404 #if defined(PROD) && defined(OFFSET) && defined(SCALE)
405  res = QUANTIZE(res_f, OFFSET, SCALE, DATA_TYPE_PROMOTED, 16);
406 #endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
407 
408  // Store result
409  vstore16(CONVERT_SAT(res, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)output.ptr);
410 #if defined(COMPLEX)
411  vstore16(CONVERT(res1, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)tensor3D_offset(&output, 8, 0, 0));
412 #endif // defined(COMPLEX)
413 }
414 #endif /* defined(DEPTH) */
415 
416 #if defined(BATCH) && defined(DEPTH)
417 /** This kernel performs reduction on w-axis.
418  *
419  * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
420  * @note The batch size must be passed at compile time using -DBATCH e.g. -DBATCH=128
421  * @note The depth size must be passed at compile time using -DBATCH e.g. -DDEPTH=128
422  *
423  * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/S32/F16/F32
424  * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
425  * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
426  * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
427  * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
428  * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
429  * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
430  * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
431  * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
432  * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
433  * @param[in] output_ptr The local buffer to hold sumed values. Supported data types: same as @p input_ptr
434  * @param[in] output_stride_x Stride of the output tensor in X dimension (in bytes)
435  * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
436  * @param[in] output_stride_y Stride of the output tensor in Y dimension (in bytes)
437  * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
438  * @param[in] output_stride_z Stride of the output tensor in Z dimension (in bytes)
439  * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
440  * @param[in] output_stride_w Stride of the output tensor in W dimension (in bytes)
441  * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
442  * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source tensor
443  */
444 __kernel void reduction_operation_w(
446  TENSOR4D_DECLARATION(output))
447 {
448  Tensor4D input = CONVERT_TO_TENSOR4D_STRUCT(input, DEPTH);
449  Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH);
450 
451  VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
452  res = CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, 0)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
453 
454  // Convert input into F32 in order to perform quantized multiplication
455 #if defined(PROD) && defined(OFFSET) && defined(SCALE)
456  float16 res_f = DEQUANTIZE(res, OFFSET, SCALE, DATA_TYPE_PROMOTED, 16);
457 #endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
458 
459 #if defined(SUM_SQUARE)
460  res *= res;
461 #endif // defined(SUM_SQUARE)
462 
463  for(unsigned int w = 1; w < BATCH; ++w)
464  {
465  VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)
466  in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, w)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16));
467 
468 #if defined(MIN)
469  res = select(res, in, ISLESS(in, res));
470 #elif defined(MAX)
471  res = select(res, in, ISGREATER(in, res));
472 #else // !(defined(MAX) || defined(MIN))
473 #if defined(SUM_SQUARE)
474  in *= in;
475 #endif // defined(SUM_SQUARE)
476 #if defined(PROD)
477 
478 #if defined(OFFSET) && defined(SCALE)
479  res_f *= DEQUANTIZE(in, OFFSET, SCALE, DATA_TYPE_PROMOTED, 16);
480 #else // !(defined(OFFSET) && defined(SCALE))
481  res *= in;
482 #endif // defined(OFFSET) && defined(SCALE)
483 
484 #else // !defined(PROD)
485  res += in;
486 #endif //defined(PROD)
487 #endif // defined(MAX) || defined(MIN)
488  }
489 
490 #if defined(MEAN)
491  res /= BATCH;
492 #endif // defined(MEAN)
493 
494  // Subtract the offsets in case of quantized SUM
495 #if defined(SUM) && defined(OFFSET) && defined(SCALE)
496  res -= (BATCH - 1) * OFFSET;
497 #endif // defined(OFFSET) && defined(OFFSET) && defined(SCALE)
498 
499  // Re-quantize
500 #if defined(PROD) && defined(OFFSET) && defined(SCALE)
501  res = QUANTIZE(res_f, OFFSET, SCALE, DATA_TYPE_PROMOTED, 16);
502 #endif // defined(PROD) && defined(OFFSET) && defined(SCALE)
503 
504  // Store result
505  vstore16(CONVERT_SAT(res, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)output.ptr);
506 }
507 #endif /* defined(BATCH) && defined(DEPTH) */
Structure to hold Vector information.
Definition: helpers.h:651
__global uchar * offset(const Image *img, int x, int y)
Get the pointer position of a Image.
Definition: helpers.h:846
SimpleTensor< float > w
Definition: DFT.cpp:156
DATA_TYPE product(__global const DATA_TYPE *input)
Calculate product of a vector.
#define CONVERT(x, type)
Definition: helpers.h:517
__global uchar * ptr
Pointer to the starting postion of the buffer.
Definition: helpers.h:680
#define CONVERT_TO_IMAGE_STRUCT(name)
Definition: helpers.h:617
#define DATA_TYPE
DATA_TYPE sum(__global const DATA_TYPE *input)
Calculate sum of a vector.
#define IMAGE_DECLARATION(name)
Definition: helpers.h:581
#define CONVERT_SAT(a, b)
Structure to hold 3D tensor information.
Definition: helpers.h:668
SimpleTensor< float > src
Definition: DFT.cpp:155
Structure to hold 4D tensor information.
Definition: helpers.h:678
#define CONVERT_TO_VECTOR_STRUCT(name)
Definition: helpers.h:611
#define DEQUANTIZE(input, offset, scale, type, size)
#define VECTOR_DECLARATION(name)
Definition: helpers.h:575
__global const uchar * tensor4D_offset(const Tensor4D *tensor, int x, int y, int z, int w)
Get the pointer position of a Tensor4D.
Definition: helpers.h:871
#define CONVERT_TO_TENSOR4D_STRUCT(name, mod_size)
Definition: helpers.h:639
__global uchar * ptr
Pointer to the starting postion of the buffer.
Definition: helpers.h:653
Structure to hold Image information.
Definition: helpers.h:659
#define ISGREATER(x, y)
__global uchar * ptr
Pointer to the starting postion of the buffer.
Definition: helpers.h:661
#define CONVERT_TO_TENSOR3D_STRUCT(name)
Definition: helpers.h:632
SimpleTensor< T > select(const SimpleTensor< uint8_t > &c, const SimpleTensor< T > &x, const SimpleTensor< T > &y)
Definition: Select.cpp:38
__global const uchar * vector_offset(const Vector *vec, int x)
Get the pointer position of a Vector.
Definition: helpers.h:835
#define ISLESS(x, y)
#define QUANTIZE(input, offset, scale, type, size)
#define TENSOR4D_DECLARATION(name)
Definition: helpers.h:599
__global uchar * ptr
Pointer to the starting postion of the buffer.
Definition: helpers.h:670
#define TENSOR3D_DECLARATION(name)
Definition: helpers.h:589
DATA_TYPE square_sum(__global const DATA_TYPE *input)
Calculate square sum of a vector.
__global const uchar * tensor3D_offset(const Tensor3D *tensor, int x, int y, int z)
Get the pointer position of a Tensor3D.
Definition: helpers.h:858
#define VEC_DATA_TYPE(type, size)
Definition: helpers.h:514