27 #define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
29 #if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
30 #define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
31 #define VEC_QUANT VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
32 #define CONVERT_RTE(x, type) (convert_##type##_rte((x)))
33 #define CONVERT_DOWN(x, type) CONVERT_RTE(x, type)
34 inline VEC_QUANT requantize(VEC_QUANT
input,
float in_offset,
float out_offset,
float in_scale,
float out_scale)
43 #if defined(DATA_TYPE)
44 #define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
46 #if defined(ELEMENT_SIZE)
48 #define SELECT_TYPE SELECT_VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
49 #define SEQ VEC_OFFS(int, VEC_SIZE)
51 #if defined(CONCATENATE_WIDTH_X2)
91 __kernel
void concatenate_width_x2(
96 const int INPUT1_WIDTH)
100 const int y = get_global_id(1);
101 const int z = get_global_id(2) % (int)DEPTH;
102 const int w = get_global_id(2) / (int)DEPTH;
103 const int x1 = min(x, (
int)INPUT1_WIDTH - (
int)
VEC_SIZE);
104 const int x2 = max(x - (
int)INPUT1_WIDTH, 0);
107 const __global uchar *dst_addr = dst_ptr + (int)dst_offset_first_element_in_bytes + x *
sizeof(DATA_TYPE) + y * (int)dst_stride_y + z * (
int)dst_stride_z +
w * (int)dst_stride_w;
108 const __global uchar *src1_addr = src1_ptr + (int)src1_offset_first_element_in_bytes + x1 *
sizeof(DATA_TYPE) + y * (int)src1_stride_y + z * (
int)src1_stride_z +
w * (int)src1_stride_w;
109 const __global uchar *src2_addr = src2_ptr + (int)src2_offset_first_element_in_bytes + x2 *
sizeof(DATA_TYPE) + y * (int)src2_stride_y + z * (
int)src2_stride_z +
w * (int)src2_stride_w;
111 VEC_TYPE src1_values =
VLOAD(
VEC_SIZE)(0, (__global DATA_TYPE *)src1_addr);
112 VEC_TYPE src2_values =
VLOAD(
VEC_SIZE)(0, (__global DATA_TYPE *)src2_addr);
114 #if defined(OFFSET_IN1) && defined(OFFSET_IN2) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_IN2) && defined(SCALE_OUT)
115 src1_values = requantize(src1_values, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT);
116 src2_values = requantize(src2_values, OFFSET_IN2, OFFSET_OUT, SCALE_IN2, SCALE_OUT);
118 const VEC_INT x_coords = SEQ + (VEC_INT)(x);
121 SELECT_TYPE cond =
CONVERT(((VEC_INT)x < (VEC_INT)INPUT1_WIDTH) && ((VEC_INT)x > (VEC_INT)(INPUT1_WIDTH -
VEC_SIZE)), SELECT_TYPE);
125 cond =
CONVERT(x_coords < (VEC_INT)(INPUT1_WIDTH), SELECT_TYPE);
126 const VEC_TYPE values0 =
select(src2_values, src1_values, cond);
130 #endif // defined(CONCATENATE_WIDTH_X2)
132 #if defined(CONCATENATE_WIDTH_X4)
195 __kernel
void concatenate_width_x4(
202 const int INPUT1_WIDTH,
203 const int INPUT2_WIDTH,
204 const int INPUT3_WIDTH)
208 const int y = get_global_id(1);
209 const int z = get_global_id(2) % (int)DEPTH;
210 const int w = get_global_id(2) / (int)DEPTH;
212 const int x1 = min(x, (
int)INPUT1_WIDTH - (
int)
VEC_SIZE);
213 const int x2 = min(max(x - (
int)INPUT1_WIDTH, 0), (
int)INPUT2_WIDTH - (
int)
VEC_SIZE);
214 const int x3 = min(max(x - (
int)INPUT1_WIDTH - (
int)INPUT2_WIDTH, 0), (
int)INPUT3_WIDTH - (
int)
VEC_SIZE);
215 const int x4 = max(x - (
int)INPUT1_WIDTH - (
int)INPUT2_WIDTH - (
int)INPUT3_WIDTH, 0);
218 const __global uchar *dst_addr = dst_ptr + (int)dst_offset_first_element_in_bytes + x *
sizeof(DATA_TYPE) + y * (int)dst_stride_y + z * (
int)dst_stride_z +
w * (int)dst_stride_w;
219 const __global uchar *src1_addr = src1_ptr + (int)src1_offset_first_element_in_bytes + x1 *
sizeof(DATA_TYPE) + y * (int)src1_stride_y + z * (
int)src1_stride_z +
w * (int)src1_stride_w;
220 const __global uchar *src2_addr = src2_ptr + (int)src2_offset_first_element_in_bytes + x2 *
sizeof(DATA_TYPE) + y * (int)src2_stride_y + z * (
int)src2_stride_z +
w * (int)src2_stride_w;
221 const __global uchar *src3_addr = src3_ptr + (int)src3_offset_first_element_in_bytes + x3 *
sizeof(DATA_TYPE) + y * (int)src3_stride_y + z * (
int)src3_stride_z +
w * (int)src3_stride_w;
222 const __global uchar *src4_addr = src4_ptr + (int)src4_offset_first_element_in_bytes + x4 *
sizeof(DATA_TYPE) + y * (int)src4_stride_y + z * (
int)src4_stride_z +
w * (int)src4_stride_w;
224 VEC_TYPE src1_values =
VLOAD(
VEC_SIZE)(0, (__global DATA_TYPE *)src1_addr);
225 VEC_TYPE src2_values =
VLOAD(
VEC_SIZE)(0, (__global DATA_TYPE *)src2_addr);
226 VEC_TYPE src3_values =
VLOAD(
VEC_SIZE)(0, (__global DATA_TYPE *)src3_addr);
227 VEC_TYPE src4_values =
VLOAD(
VEC_SIZE)(0, (__global DATA_TYPE *)src4_addr);
229 #if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) && defined(OFFSET_IN2) && defined(SCALE_IN2) && defined(OFFSET_IN3) && defined(SCALE_IN3) && defined(OFFSET_IN4) && defined(SCALE_IN4)
230 src1_values = requantize(src1_values, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT);
231 src2_values = requantize(src2_values, OFFSET_IN2, OFFSET_OUT, SCALE_IN2, SCALE_OUT);
232 src3_values = requantize(src3_values, OFFSET_IN3, OFFSET_OUT, SCALE_IN3, SCALE_OUT);
233 src4_values = requantize(src4_values, OFFSET_IN4, OFFSET_OUT, SCALE_IN4, SCALE_OUT);
236 const VEC_INT x_coords = SEQ + (VEC_INT)(x);
238 SELECT_TYPE cond_in2 =
CONVERT(((VEC_INT)x < (VEC_INT)INPUT1_WIDTH && (VEC_INT)x > (VEC_INT)(INPUT1_WIDTH -
VEC_SIZE)), SELECT_TYPE);
239 SELECT_TYPE cond_in3 =
CONVERT(((VEC_INT)x < (VEC_INT)(INPUT1_WIDTH + INPUT2_WIDTH) && (VEC_INT)x > (VEC_INT)(INPUT1_WIDTH + INPUT2_WIDTH -
VEC_SIZE)), SELECT_TYPE);
240 SELECT_TYPE cond_in4 =
CONVERT(((VEC_INT)x < (VEC_INT)(INPUT1_WIDTH + INPUT2_WIDTH + INPUT3_WIDTH) && (VEC_INT)x > (VEC_INT)(INPUT1_WIDTH + INPUT2_WIDTH + INPUT3_WIDTH -
VEC_SIZE)), SELECT_TYPE);
252 cond_in2 =
CONVERT(x_coords < (VEC_INT)(INPUT1_WIDTH), SELECT_TYPE);
253 cond_in3 =
CONVERT(x_coords < (VEC_INT)(INPUT1_WIDTH + INPUT2_WIDTH), SELECT_TYPE);
254 cond_in4 =
CONVERT(x_coords < (VEC_INT)(INPUT1_WIDTH + INPUT2_WIDTH + INPUT3_WIDTH), SELECT_TYPE);
256 VEC_TYPE values0 =
select(src2_values, src1_values, cond_in2);
257 values0 =
select(src3_values, values0, cond_in3);
258 values0 =
select(src4_values, values0, cond_in4);
265 #if defined(WIDTH_OFFSET) && defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER)
266 #if defined(CONCATENATE_WIDTH)
296 __kernel
void concatenate_width(
303 const int y = get_global_id(1);
304 const int z = get_global_id(2) % (int)DEPTH;
305 const int w = get_global_id(2) / (int)DEPTH;
307 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x *
sizeof(DATA_TYPE) + y * src_stride_y + z * src_stride_z +
w * src_stride_w;
308 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x *
sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z +
w * dst_stride_w;
310 VEC_TYPE source_values0 =
VLOAD(
VEC_SIZE)(0, (__global DATA_TYPE *)src_addr);
312 #if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
313 const VEC_QUANT out0 = requantize(source_values0, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT);
314 STORE_VECTOR_SELECT(out, DATA_TYPE, dst_addr + WIDTH_OFFSET *
sizeof(DATA_TYPE),
VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
316 STORE_VECTOR_SELECT(source_values, DATA_TYPE, dst_addr + WIDTH_OFFSET *
sizeof(DATA_TYPE),
VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
322 #if defined(VEC_SIZE_LEFTOVER)
323 #if defined(CONCATENATE_HEIGHT)
324 #if defined(HEIGHT_OFFSET) && defined(VEC_SIZE)
355 __kernel
void concatenate_height(
360 const int x_offs = max((
int)(get_global_id(0) *
VEC_SIZE - (
VEC_SIZE - VEC_SIZE_LEFTOVER) %
VEC_SIZE), 0) *
sizeof(DATA_TYPE);
362 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) * src_stride_y + (get_global_id(2) %
DEPTH) * src_stride_z + (get_global_id(
363 2) /
DEPTH) * src_stride_w;
364 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) * dst_stride_y + (get_global_id(2) %
DEPTH) * dst_stride_z + (get_global_id(
365 2) /
DEPTH) * dst_stride_w;
367 VEC_TYPE source_values0 =
VLOAD(
VEC_SIZE)(0, (__global DATA_TYPE *)src_addr);
369 #if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
370 const VEC_QUANT out0 = requantize(source_values0, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT);
371 STORE_VECTOR_SELECT(out, DATA_TYPE, dst_addr + HEIGHT_OFFSET * dst_stride_y,
VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
373 STORE_VECTOR_SELECT(source_values, DATA_TYPE, dst_addr + HEIGHT_OFFSET * dst_stride_y,
VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
379 #if defined(CONCATENATE)
404 __kernel
void concatenate(
409 uint x_offs = max((
int)(get_global_id(0) *
VEC_SIZE *
sizeof(DATA_TYPE) - (
VEC_SIZE - VEC_SIZE_LEFTOVER) %
VEC_SIZE *
sizeof(DATA_TYPE)), 0);
411 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x_offs + get_global_id(1) * src_stride_y + get_global_id(2) * src_stride_z;
412 __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x_offs + get_global_id(1) * dst_stride_y + get_global_id(2) * dst_stride_z;
414 VEC_TYPE source_values0 =
VLOAD(
VEC_SIZE)(0, (__global DATA_TYPE *)src_addr);
416 #if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
417 source_values0 = requantize(source_values0, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT);
422 #endif // defined(CONCATENATE)