27 #if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DATA_TYPE_OUTPUT)
29 #define VEC_TYPE_IN VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
30 #define VEC_TYPE_OUT VEC_DATA_TYPE(DATA_TYPE_OUTPUT, VEC_SIZE)
31 #define VEC_SELECT_IN SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
32 #define VEC_SIGNED_INT_IN SIGNED_INT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
34 #if defined(FLOAT_DATA_TYPE)
35 #define ISGREATER(x, y) (VEC_SELECT_IN) isgreater(x, y)
36 #define ISLESS(x, y) (VEC_SELECT_IN) isless(x, y)
37 #else // !FLOAT_DATA_TYPE
39 #define ISGREATER(x, y) (x > y) ? 1 : 0
40 #define ISLESS(x, y) (x < y) ? 1 : 0
41 #else // !defined(WIDTH)
42 #define ISGREATER(x, y) select((VEC_SIGNED_INT_IN)0, (VEC_SIGNED_INT_IN)-1, (VEC_SIGNED_INT_IN)(x > y))
43 #define ISLESS(x, y) select((VEC_SIGNED_INT_IN)0, (VEC_SIGNED_INT_IN)-1, (VEC_SIGNED_INT_IN)(x < y))
44 #endif // defined(WIDTH)
45 #endif // defined(FLOAT_DATA_TYPE)
48 #define CONDITION_TO_USE(x, y) ISGREATER(x, y)
49 #elif defined(ARG_MIN)
50 #define CONDITION_TO_USE(x, y) ISLESS(x, y)
51 #else // !(defined(ARG_MAX) || defined(ARG_MIN))
52 #error "Unsupported reduction operation!"
53 #endif // defined(ARG_MAX)
58 #define VECTOR_PREDICATE_EQ(x, y) ((x) >= (y))
59 #define VECTOR_PREDICATE(x, y) ((x) > (y))
60 #define SCALAR_SELECT_OP(x, y) ((x) > (y)) ? (x) : (y);
61 #elif defined(ARG_MIN)
62 #define VECTOR_PREDICATE_EQ(x, y) ((x) <= (y))
63 #define VECTOR_PREDICATE(x, y) ((x) < (y))
64 #define SCALAR_SELECT_OP(x, y) ((x) < (y)) ? (x) : (y);
65 #else // !(defined(ARG_MAX) || defined(ARG_MIN))
66 #error "Unsupported reduction operation!"
67 #endif // defined(ARG_MAX)
69 inline DATA_TYPE_OUTPUT vectorized_compute_arg_min_max_2(DATA_TYPE *min_max_val, DATA_TYPE_OUTPUT *min_max_idx,
VEC_DATA_TYPE(DATA_TYPE, 2) in,
VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 2) res)
71 if( VECTOR_PREDICATE_EQ(in.s0,in.s1) )
74 *min_max_idx = res.s0;
79 *min_max_idx = res.s1;
83 inline DATA_TYPE_OUTPUT vectorized_compute_arg_min_max_4(DATA_TYPE *min_max_val, DATA_TYPE_OUTPUT *min_max_idx,
VEC_DATA_TYPE(DATA_TYPE, 4) in,
VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 4) res)
86 idx_sel = VECTOR_PREDICATE_EQ(in.s01, in.s23);
87 in.s01 =
select(in.s23, in.s01, idx_sel);
89 idx_sel.s0 = VECTOR_PREDICATE(in.s0, in.s1) || (in.s0 == in.s1 &&
CONVERT((res.s0 < res.s1), COND_DATA_TYPE));
90 res.s0 =
select(res.s1, res.s0,
CONVERT(idx_sel.s0, DATA_TYPE_OUTPUT));
91 *min_max_val = SCALAR_SELECT_OP(in.s0, in.s1);
92 *min_max_idx = res.s0;
95 inline DATA_TYPE_OUTPUT vectorized_compute_arg_min_max_8(DATA_TYPE *min_max_val, DATA_TYPE_OUTPUT *min_max_idx,
VEC_DATA_TYPE(DATA_TYPE, 8) in,
VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 8) res)
98 idx_sel = VECTOR_PREDICATE_EQ(in.s0123, in.s4567);
99 in.s0123 =
select(in.s4567, in.s0123, idx_sel);
101 idx_sel.s01 = (VECTOR_PREDICATE(in.s01, in.s23)) || (in.s01 == in.s23 &&
CONVERT(((res.s01 < res.s23)),
VEC_DATA_TYPE(COND_DATA_TYPE, 2)));
102 in.s01 =
select(in.s23, in.s01, idx_sel.s01);
104 idx_sel.s0 = VECTOR_PREDICATE(in.s0, in.s1) || (in.s0 == in.s1 &&
CONVERT((res.s0 < res.s1), COND_DATA_TYPE));
105 res.s0 =
select(res.s1, res.s0,
CONVERT(idx_sel.s0, DATA_TYPE_OUTPUT));
106 *min_max_val = SCALAR_SELECT_OP(in.s0, in.s1);
107 *min_max_idx = res.s0;
110 inline DATA_TYPE_OUTPUT vectorized_compute_arg_min_max_16(DATA_TYPE *min_max_val, DATA_TYPE_OUTPUT *min_max_idx,
VEC_DATA_TYPE(DATA_TYPE, 16) in,
VEC_DATA_TYPE(DATA_TYPE_OUTPUT, 16) res)
113 idx_sel = VECTOR_PREDICATE_EQ(in.s01234567, in.s89abcdef);
114 in.s01234567 =
select(in.s89abcdef, in.s01234567, idx_sel);
116 idx_sel.s0123 = VECTOR_PREDICATE(in.s0123, in.s4567) || (in.s0123 == in.s4567 &&
CONVERT(((res.s0123 < res.s4567)),
VEC_DATA_TYPE(COND_DATA_TYPE, 4)));
117 in.s0123 =
select(in.s4567, in.s0123, idx_sel.s0123);
119 idx_sel.s01 = (VECTOR_PREDICATE(in.s01, in.s23)) || (in.s01 == in.s23 &&
CONVERT(((res.s01 < res.s23)),
VEC_DATA_TYPE(COND_DATA_TYPE, 2)));
120 in.s01 =
select(in.s23, in.s01, idx_sel.s01);
122 idx_sel.s0 = VECTOR_PREDICATE(in.s0, in.s1) || (in.s0 == in.s1 &&
CONVERT((res.s0 < res.s1), COND_DATA_TYPE));
123 res.s0 =
select(res.s1, res.s0,
CONVERT(idx_sel.s0, DATA_TYPE_OUTPUT));
124 *min_max_val = SCALAR_SELECT_OP(in.s0, in.s1);
125 *min_max_idx = res.s0;
130 inline
void scalar_compute_global_min_max(DATA_TYPE in_val,
int idx, DATA_TYPE *out_min_max_val, DATA_TYPE_OUTPUT *out_idx)
133 if(in_val > *out_min_max_val)
134 #else // defined(ARG_MAX)
135 if(in_val < *out_min_max_val)
136 #endif // defined(ARG_MAX)
138 *out_min_max_val = in_val;
145 #define VECTORIZED_OP(min_max_val,min_max_idx,in,res) vectorized_compute_arg_min_max_16(min_max_val,min_max_idx,in,res)
146 #elif VEC_SIZE == 8 // #if VEC_SIZE == 16
147 #define VECTORIZED_OP(min_max_val,min_max_idx,in,res) vectorized_compute_arg_min_max_8(min_max_val,min_max_idx,in,res)
148 #elif VEC_SIZE == 4 // # elif VEC_SIZE == 8
149 #define VECTORIZED_OP(min_max_val,min_max_idx,in,res) vectorized_compute_arg_min_max_4(min_max_val,min_max_idx,in,res)
150 #elif VEC_SIZE == 2 // elif VEC_SIZE == 4
151 #define VECTORIZED_OP(min_max_val,min_max_idx,in,res) vectorized_compute_arg_min_max_2(min_max_val,min_max_idx,in,res)
152 #else // elif VEC_SIZE == 2
153 #error "Not supported"
154 #endif // #if VEC_SIZE == 16
160 vidx = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
161 #elif VEC_SIZE == 8 // #if VEC_SIZE == 16
163 vidx = { 0, 1, 2, 3, 4, 5, 6, 7 };
164 #elif VEC_SIZE == 4 // elif VEC_SIZE == 8
166 vidx = { 0, 1, 2, 3 };
167 #elif VEC_SIZE == 2 // elif VEC_SIZE == 4
170 #else // elif VEC_SIZE == 2
171 #error "Not supported"
172 #endif // #if VEC_SIZE == 16
175 #endif // VEC_SIZE > 1
197 __kernel
void arg_min_max_x(
201 __global DATA_TYPE *input_addr = (__global DATA_TYPE *)(input_ptr + input_offset_first_element_in_bytes + get_global_id(1) * input_stride_y);
202 __global DATA_TYPE_OUTPUT *output_addr = (__global DATA_TYPE_OUTPUT *)(output_ptr + output_offset_first_element_in_bytes + get_global_id(1) * output_stride_y);
204 DATA_TYPE final_value = input_addr[0];
205 DATA_TYPE_OUTPUT final_idx = 0;
209 vidx = init_idx_vector();
216 DATA_TYPE local_min_max_value;
217 DATA_TYPE_OUTPUT local_min_max_idx;
219 VECTORIZED_OP(&local_min_max_value, &local_min_max_idx, vals, vidx);
220 local_min_max_idx += x;
221 scalar_compute_global_min_max(local_min_max_value, local_min_max_idx, &final_value, &final_idx);
223 #endif // VEC_SIZE > 1
225 #if(WIDTH % VEC_SIZE)
228 scalar_compute_global_min_max(*(input_addr + j + x), j + x, &final_value, &final_idx);
232 output_addr[0] = final_idx;
234 #endif // defined(WIDTH)
257 __kernel
void arg_min_max_y(
262 __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs *
sizeof(DATA_TYPE) + get_global_id(1) * input_stride_y;
263 __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs *
sizeof(DATA_TYPE_OUTPUT) + get_global_id(1) * output_stride_y;
267 VEC_TYPE_OUT indx0 = 0;
268 for(DATA_TYPE_OUTPUT y = 1; y <
HEIGHT; ++y)
270 VEC_TYPE_IN in =
CONVERT(
VLOAD(
VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + y * input_stride_y)), VEC_TYPE_IN);
272 VEC_TYPE_OUT cond_conv =
CONVERT(CONDITION_TO_USE(in, res), VEC_TYPE_OUT);
273 indx0 =
select(indx0, (VEC_TYPE_OUT)y, cond_conv);
274 res =
select(res, in, CONDITION_TO_USE(in, res));
278 STORE_VECTOR_SELECT(indx, DATA_TYPE_OUTPUT, output_addr,
VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
280 #endif // defined(HEIGHT)
282 #if defined(DEPTH) && !defined(BATCH)
306 __kernel
void arg_min_max_z(
312 __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs *
sizeof(DATA_TYPE) + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z;
313 __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs *
sizeof(DATA_TYPE_OUTPUT) + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z;
317 VEC_TYPE_OUT indx0 = 0;
318 for(DATA_TYPE_OUTPUT z = 1; z <
DEPTH; ++z)
320 VEC_TYPE_IN in =
CONVERT(
VLOAD(
VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + z * input_stride_z)), VEC_TYPE_IN);
322 VEC_TYPE_OUT cond_conv =
CONVERT(CONDITION_TO_USE(in, res), VEC_TYPE_OUT);
323 indx0 =
select(indx0, (VEC_TYPE_OUT)z, cond_conv);
324 res =
select(res, in, CONDITION_TO_USE(in, res));
328 STORE_VECTOR_SELECT(indx, DATA_TYPE_OUTPUT, output_addr,
VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
332 #if defined(BATCH) && defined(DEPTH)
361 __kernel
void arg_min_max_w(
367 __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs *
sizeof(DATA_TYPE) + get_global_id(1) * input_stride_y + (get_global_id(2) %
DEPTH) * input_stride_z +
368 (get_global_id(2) /
DEPTH) * input_stride_w;
369 __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs *
sizeof(DATA_TYPE_OUTPUT) + get_global_id(1) * output_stride_y + (get_global_id(
370 2) %
DEPTH) * output_stride_z + (get_global_id(2) /
DEPTH) * output_stride_w;
374 VEC_TYPE_OUT indx0 = 0;
375 for(DATA_TYPE_OUTPUT
w = 1;
w < BATCH; ++
w)
377 VEC_TYPE_IN in =
CONVERT(
VLOAD(
VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr +
w * input_stride_w)), VEC_TYPE_IN);
379 VEC_TYPE_OUT cond_conv =
CONVERT(CONDITION_TO_USE(in, res), VEC_TYPE_OUT);
380 indx0 =
select(indx0, (VEC_TYPE_OUT)
w, cond_conv);
381 res =
select(res, in, CONDITION_TO_USE(in, res));
385 STORE_VECTOR_SELECT(indx, DATA_TYPE_OUTPUT, output_addr,
VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
388 #endif // defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DATA_TYPE_OUTPUT)