26 #if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DATA_TYPE_OUTPUT) 28 #define VEC_TYPE_IN VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) 29 #define VEC_TYPE_OUT VEC_DATA_TYPE(DATA_TYPE_OUTPUT, VEC_SIZE) 30 #define VEC_SELECT_IN SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) 31 #define VEC_SIGNED_INT_IN SIGNED_INT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) 33 #if defined(FLOAT_DATA_TYPE) 34 #define ISGREATER(x, y) (VEC_SELECT_IN) isgreater(x, y) 35 #define ISLESS(x, y) (VEC_SELECT_IN) isless(x, y) 36 #else // !FLOAT_DATA_TYPE 38 #define ISGREATER(x, y) (x > y) ? 1 : 0 39 #define ISLESS(x, y) (x < y) ? 1 : 0 40 #else // !defined(WIDTH) 41 #define ISGREATER(x, y) select((VEC_SIGNED_INT_IN)0, (VEC_SIGNED_INT_IN)-1, (VEC_SIGNED_INT_IN)(x > y)) 42 #define ISLESS(x, y) select((VEC_SIGNED_INT_IN)0, (VEC_SIGNED_INT_IN)-1, (VEC_SIGNED_INT_IN)(x < y)) 43 #endif // defined(WIDTH) 44 #endif // defined(FLOAT_DATA_TYPE) 47 #define CONDITION_TO_USE(x, y) ISGREATER(x, y) 48 #elif defined(ARG_MIN) 49 #define CONDITION_TO_USE(x, y) ISLESS(x, y) 50 #else // !(defined(ARG_MAX) || defined(ARG_MIN)) 51 #error "Unsupported reduction operation!" 52 #endif // defined(ARG_MAX) 56 #if defined(PREV_OUTPUT) 63 inline DATA_TYPE_OUTPUT arg_idx_min_prev_out(__global
const DATA_TYPE *
input, __global
const DATA_TYPE_OUTPUT *prev_res,
const int x_idx)
65 int end_elem = (x_idx + 1) * 16;
68 end_elem =
WIDTH - x_idx * 16;
70 DATA_TYPE_OUTPUT res = prev_res[0];
71 for(
int x_v = 1; x_v < end_elem; ++x_v)
73 res =
select(res, prev_res[x_v], *(input + prev_res[x_v]) < * (input + res));
77 #else // !defined(PREV_OUTPUT) 84 inline DATA_TYPE_OUTPUT arg_idx_min(__global
const DATA_TYPE *input,
const int x_idx)
87 DATA_TYPE_OUTPUT res = 0;
88 for(DATA_TYPE_OUTPUT x_v = res + 1; x_v <
WIDTH; ++x_v)
90 res =
select(res, x_v, *(input + x_v) < * (input + res));
94 int x_elem = x_idx * 16;
95 const int x_goback =
select(0, 16 - WIDTH % 16, x_elem + 16 > WIDTH);
99 in = vload16(0, input - x_goback);
101 res = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
104 idx_sel = (in.s01234567 <= in.s89abcdef);
105 in.s01234567 =
select(in.s89abcdef, in.s01234567, idx_sel);
106 res.s01234567 =
select(res.s89abcdef, res.s01234567,
CONVERT(idx_sel, int8));
109 in.s0123 =
select(in.s4567, in.s0123, idx_sel.s0123);
110 res.s0123 =
select(res.s4567, res.s0123,
CONVERT(idx_sel.s0123, int4));
113 in.s01 =
select(in.s23, in.s01, idx_sel.s01);
114 res.s01 =
select(res.s23, res.s01,
CONVERT(idx_sel.s01, int2));
119 return res.s0 + x_elem;
122 #endif // defined(PREV_OUTPUT) 123 #endif // defined(ARG_MIN) 125 #if defined(PREV_OUTPUT) 132 inline DATA_TYPE_OUTPUT arg_idx_max_prev_out(__global
const DATA_TYPE *input, __global
const DATA_TYPE_OUTPUT *prev_res,
const int x_idx)
134 int end_elem = (x_idx + 1) * 16;
137 end_elem = WIDTH - x_idx * 16;
139 DATA_TYPE_OUTPUT res = prev_res[0];
140 for(
int x_v = 1; x_v < end_elem; ++x_v)
142 res =
select(res, prev_res[x_v], *(input + prev_res[x_v]) > *(input + res));
146 #else // !defined(PREV_OUTPUT) 153 inline DATA_TYPE_OUTPUT arg_idx_max(__global
const DATA_TYPE *input,
const int x_idx)
156 DATA_TYPE_OUTPUT res = 0;
157 for(DATA_TYPE_OUTPUT x_v = res + 1; x_v < WIDTH; ++x_v)
159 res =
select(res, x_v, *(input + x_v) > *(input + res));
163 int x_elem = x_idx * 16;
164 const int x_goback =
select(0, 16 - WIDTH % 16, x_elem + 16 > WIDTH);
168 in = vload16(0, input - x_goback);
170 res = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 };
173 idx_sel = (in.s01234567 >= in.s89abcdef);
174 in.s01234567 =
select(in.s89abcdef, in.s01234567, idx_sel);
175 res.s01234567 =
select(res.s89abcdef, res.s01234567,
CONVERT(idx_sel, int8));
178 in.s0123 =
select(in.s4567, in.s0123, idx_sel.s0123);
179 res.s0123 =
select(res.s4567, res.s0123,
CONVERT(idx_sel.s0123, int4));
182 in.s01 =
select(in.s23, in.s01, idx_sel.s01);
183 res.s01 =
select(res.s23, res.s01,
CONVERT(idx_sel.s01, int2));
188 return res.s0 + x_elem;
191 #endif // defined(PREV_OUTPUT) 192 #endif // defined(ARG_MAX) 222 __kernel
void arg_min_max_x(
224 #
if defined(PREV_OUTPUT)
228 __local DATA_TYPE_OUTPUT *local_results)
230 #if defined(PREV_OUTPUT) 233 #else // !defined(PREV_OUTPUT) 235 #endif // defined(PREV_OUTPUT) 238 unsigned int lsize = get_local_size(0);
239 unsigned int lid = get_local_id(0);
241 const uint x_idx = get_global_id(0);
242 const uint y_idx = get_global_id(1);
243 const __global
DATA_TYPE *src_in_row = (
const __global
DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + y_idx * src_step_y);
245 for(
unsigned int y = 0; y < get_local_size(1); ++y)
248 #if defined(PREV_OUTPUT) 249 local_results[lid] = arg_idx_max_prev_out(src_in_row, (__global DATA_TYPE_OUTPUT *)
offset(&prev_res, 0, y), x_idx);
250 #else // !defined(PREV_OUTPUT) 251 local_results[lid] = arg_idx_max((__global
DATA_TYPE *)
offset(&src, 0, y), x_idx);
252 #endif // defined(PREV_OUTPUT) 253 #else // defined(ARG_MIN) 254 #if defined(PREV_OUTPUT) 255 local_results[lid] = arg_idx_min_prev_out(src_in_row, (__global DATA_TYPE_OUTPUT *)
offset(&prev_res, 0, y), x_idx);
256 #else // !defined(PREV_OUTPUT) 257 local_results[lid] = arg_idx_min((__global
DATA_TYPE *)
offset(&src, 0, y), x_idx);
258 #endif // defined(PREV_OUTPUT) 259 #endif // defined(ARG_MAX) || defined(ARG_MIN) 261 barrier(CLK_LOCAL_MEM_FENCE);
264 unsigned int middle = lsize - 1;
265 middle |= middle >> 1;
266 middle |= middle >> 2;
269 for(
unsigned int i = middle; i > 0; i >>= 1)
271 if(lid < i && lid + i < lsize)
273 DATA_TYPE tmp0 = *(src_in_row + local_results[lid]);
274 DATA_TYPE tmp1 = *(src_in_row + local_results[lid + i]);
276 local_results[lid] =
select(
278 local_results[lid + i],
279 ((tmp0 == tmp1) && (local_results[lid + i] < local_results[lid])) || (tmp0 < tmp1));
280 #else // defined(ARG_MIN) 281 local_results[lid] =
select(
283 local_results[lid + i],
284 ((tmp0 == tmp1) && (local_results[lid + i] < local_results[lid])) || (tmp0 > tmp1));
285 #endif // defined(ARG_MAX) || defined(ARG_MIN) 287 barrier(CLK_LOCAL_MEM_FENCE);
292 ((__global DATA_TYPE_OUTPUT *)
offset(&partial_res, get_group_id(0), y))[0] = local_results[0];
296 #endif // defined(WIDTH) 319 __kernel
void arg_min_max_y(
325 __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs *
sizeof(
DATA_TYPE) + get_global_id(1) *
input_stride_y;
326 __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs *
sizeof(DATA_TYPE_OUTPUT) + get_global_id(1) * output_stride_y;
330 VEC_TYPE_OUT indx0 = 0;
331 for(DATA_TYPE_OUTPUT y = 1; y <
HEIGHT; ++y)
335 VEC_TYPE_OUT cond_conv =
CONVERT(CONDITION_TO_USE(in, res), VEC_TYPE_OUT);
336 indx0 =
select(indx0, (VEC_TYPE_OUT)y, cond_conv);
337 res =
select(res, in, CONDITION_TO_USE(in, res));
341 STORE_VECTOR_SELECT(indx, DATA_TYPE_OUTPUT, output_addr,
VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
343 #endif // defined(HEIGHT) 345 #if defined(DEPTH) && !defined(BATCH) 369 __kernel
void arg_min_max_z(
375 __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs *
sizeof(
DATA_TYPE) + get_global_id(1) * input_stride_y + get_global_id(2) *
input_stride_z;
376 __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs *
sizeof(DATA_TYPE_OUTPUT) + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z;
380 VEC_TYPE_OUT indx0 = 0;
381 for(DATA_TYPE_OUTPUT z = 1; z < DEPTH; ++z)
385 VEC_TYPE_OUT cond_conv =
CONVERT(CONDITION_TO_USE(in, res), VEC_TYPE_OUT);
386 indx0 =
select(indx0, (VEC_TYPE_OUT)z, cond_conv);
387 res =
select(res, in, CONDITION_TO_USE(in, res));
391 STORE_VECTOR_SELECT(indx, DATA_TYPE_OUTPUT, output_addr,
VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
395 #if defined(BATCH) && defined(DEPTH) 424 __kernel
void arg_min_max_w(
430 __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs *
sizeof(
DATA_TYPE) + get_global_id(1) * input_stride_y + (get_global_id(2) % DEPTH) * input_stride_z +
431 (get_global_id(2) / DEPTH) * input_stride_w;
432 __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs *
sizeof(DATA_TYPE_OUTPUT) + get_global_id(1) * output_stride_y + (get_global_id(
433 2) % DEPTH) * output_stride_z + (get_global_id(2) / DEPTH) * output_stride_w;
437 VEC_TYPE_OUT indx0 = 0;
438 for(DATA_TYPE_OUTPUT
w = 1;
w < BATCH; ++
w)
442 VEC_TYPE_OUT cond_conv =
CONVERT(CONDITION_TO_USE(in, res), VEC_TYPE_OUT);
443 indx0 =
select(indx0, (VEC_TYPE_OUT)
w, cond_conv);
444 res =
select(res, in, CONDITION_TO_USE(in, res));
448 STORE_VECTOR_SELECT(indx, DATA_TYPE_OUTPUT, output_addr,
VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
451 #endif // defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DATA_TYPE_OUTPUT) __global uchar * offset(const Image *img, int x, int y)
Get the pointer position of a Image.
#define CONVERT_TO_IMAGE_STRUCT(name)
#define IMAGE_DECLARATION(name)
const size_t input_stride_y
SimpleTensor< float > src
#define CONVERT_TO_IMAGE_STRUCT_NO_STEP(name)
Structure to hold Image information.
#define STORE_VECTOR_SELECT(basename, data_type, ptr, vec_size, leftover, cond)
Store a vector that can only be partial in x.
SimpleTensor< T > select(const SimpleTensor< uint8_t > &c, const SimpleTensor< T > &x, const SimpleTensor< T > &y)
#define SIGNED_INT_VEC_DATA_TYPE(type, size)
#define SIGNED_INT_DATA_TYPE(type)
#define TENSOR4D_DECLARATION(name)
const size_t input_stride_z
#define TENSOR3D_DECLARATION(name)
#define VEC_DATA_TYPE(type, size)