// Reduction operator used to combine two pooling inputs:
//  - average / L2 pooling accumulates a sum;
//  - otherwise (max pooling) it keeps the maximum — integer 'max' for
//    quantized data, 'fmax' for floating point.
// NOTE(review): the '#else' separating the sum branch from the max branch and
// the final '#endif' of this conditional appear to have been dropped from
// this paste — restore them from the upstream file.
26 #if defined(POOL_AVG) || defined(POOL_L2)
27 #define POOL_OP(x, y) ((x) + (y))
29 #if defined(QUANTIZED)
30 #define POOL_OP(x, y) (max((x), (y)))
31 #else // defined(QUANTIZED)
32 #define POOL_OP(x, y) (fmax((x), (y)))
33 #endif // defined(QUANTIZED)
// POW2_OP squares each element (used by L2 pooling); the second definition is
// the identity used by the other pooling modes.
// NOTE(review): the '#if'/'#else'/'#endif' directives guarding these two
// definitions (presumably on POOL_L2) are missing from this paste.
37 #define POW2_OP(x, vec_size) ((x) * (x))
39 #define POW2_OP(x, vec_size) (x)
// Divide via multiplication by the reciprocal (single division, vectorizes).
42 #define DIV_OP(x, y) (x * (1.f / y))
43 #define SQRT_OP(x) sqrt((x))
// Loading helpers: when accumulating in a wider type (mixed precision or
// quantized), vload then convert to ACC_DATA_TYPE; otherwise a plain vload.
// NOTE(review): the '#else' and '#endif' around the second definition are
// missing from this paste.
45 #if defined(FP_MIXED_PRECISION) || defined(QUANTIZED)
46 #define CONVERT_TO_ACC_DATA_TYPE(x, n) CONVERT(x, VEC_DATA_TYPE(ACC_DATA_TYPE, n))
47 #define VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(n, offset, ptr) CONVERT_TO_ACC_DATA_TYPE(vload##n(offset, ptr), n)
49 #define VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(n, offset, ptr) vload##n(offset, ptr)
/** Compute the divisor for an average/L2 pooling window.
 *
 * The window [start, start + pool_size) implied by the current work-item is
 * clipped against the given upper bounds so that out-of-range taps are not
 * counted. When EXCLUDE_PADDING is defined, taps falling in the left/top
 * padding region are excluded from the count as well.
 *
 * @param pool_size_x   Pooling window width
 * @param pool_size_y   Pooling window height
 * @param upper_bound_w Right border used for clipping (may include padding)
 * @param upper_bound_h Bottom border used for clipping (may include padding)
 * @param pad_x         Left padding of the input
 * @param pad_y         Top padding of the input
 * @param stride_x      Horizontal pooling stride
 * @param stride_y      Vertical pooling stride
 *
 * @return Number of elements contributing to this window (the averaging divisor).
 */
ACC_DATA_TYPE calculate_avg_scale(const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
                                  const int pad_x, const int pad_y, const int stride_x, const int stride_y)
{
    // Top-left corner of this work-item's pooling window; can be negative
    // when the window starts inside the padding region.
    int start_x = get_global_id(0) * stride_x - pad_x;
    int start_y = get_global_id(1) * stride_y - pad_y;
    // Clip the bottom-right corner against the valid bounds.
    const int end_x = min(start_x + pool_size_x, upper_bound_w);
    const int end_y = min(start_y + pool_size_y, upper_bound_h);
#if defined(EXCLUDE_PADDING)
    // Do not count taps that fall in the left/top padding.
    start_x = max(0, start_x);
    start_y = max(0, start_y);
#endif // defined(EXCLUDE_PADDING)
    return ((end_y - start_y) * (end_x - start_x));
}
66 #if defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
// Pooling kernel for an MxN window over an NCHW tensor.
// NOTE(review): this paste has dropped many lines of the original kernel —
// the tensor parameter declarations in the signature, all braces, the vector
// declaration of 'vdata', the declarations of 'x', 'reduce4' and 'reduce2',
// the vectorized load/accumulate statements of the 8-wide loop, the fold of
// 'sdata' into 'res', and several '#endif' directives. Restore them from the
// upstream file before compiling.
96 __kernel
void pooling_layer_MxN_nchw(
// Work-item coordinates: id0 = output x, id1 = output y, id2 = channel plane.
100 int id0 = get_global_id(0);
101 int id1 = get_global_id(1);
102 int id2 = get_global_id(2);
// Top-left input coordinate of this work-item's pooling window; may be
// negative when the window starts inside the padding region.
104 int x_coords = (id0 * STRIDE_X) - PAD_X;
105 int y_coords = (id1 * STRIDE_Y) - PAD_Y;
// Base byte address of the input row/plane this window starts at.
107 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + y_coords * (int)src_stride_y + id2 * src_stride_z;
// 8-wide vector accumulator plus a scalar accumulator for leftover columns,
// both seeded with INITIAL_VALUE (the pooling identity, defined elsewhere —
// presumably 0 for sum-based pooling and the type minimum for max pooling).
// NOTE(review): the declaration line of 'vdata' is missing here.
110 vdata = INITIAL_VALUE;
111 ACC_DATA_TYPE sdata = INITIAL_VALUE;
// Clip the window extent against the right/bottom borders of the input.
113 const
int end_x = min((
int)POOL_SIZE_X, (
int)(SRC_WIDTH - x_coords));
114 const
int end_y = min((
int)POOL_SIZE_Y, (
int)(SRC_HEIGHT - y_coords));
// Walk the window row by row, skipping rows that fall in the top padding.
117 for(
int y = 0; y < end_y; ++y)
119 if((y_coords + y) >= 0)
// Vectorized part: 8 columns per iteration.
122 for(; x <= (end_x - 8); x += 8)
124 int8 src_x = (int8)(x_coords + x) +
VEC_OFFS(
int, 8);
125 #if defined(POOL_AVG) || defined(POOL_L2)
// Sum-based pooling: out-of-bounds columns must contribute 0, so the clamped
// indices below are paired with a select on a (src_x < 0) mask (the mask and
// load/accumulate lines are missing from this paste).
128 src_x =
clamp(src_x, (int8)0, (int8)(SRC_WIDTH - 1));
131 #else // defined(POOL_AVG) || defined(POOL_L2)
// Max pooling: a clamped (duplicated-border) load is harmless to the result.
132 src_x =
clamp(src_x, 0, SRC_WIDTH - 1);
135 #endif // defined(POOL_AVG) || defined(POOL_L2)
// Leftover part: remaining columns one at a time, accumulated into 'sdata'.
146 for(; x < end_x; ++x)
148 int src_x = x_coords + x;
149 #if defined(POOL_AVG) || defined(POOL_L2)
// Zero out the contribution of columns that fall in the left padding.
151 cond_x = (src_x < 0);
152 src_x =
clamp(src_x, 0, SRC_WIDTH - 1);
153 ACC_DATA_TYPE data0 =
select((ACC_DATA_TYPE)(*((__global DATA_TYPE *)(src_addr + src_x *
sizeof(DATA_TYPE) + y * src_stride_y))), (ACC_DATA_TYPE)0, cond_x);
154 #else // defined(POOL_AVG) || defined(POOL_L2)
155 src_x =
clamp(src_x, 0, SRC_WIDTH - 1);
156 ACC_DATA_TYPE data0 = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)(src_addr + src_x *
sizeof(DATA_TYPE) + y * src_stride_y)));
157 #endif // defined(POOL_AVG) || defined(POOL_L2)
// Tree-reduce the 8-wide accumulator down to a single scalar result.
171 reduce4 =
POOL_OP(vdata.s0123, vdata.s4567);
173 reduce2 =
POOL_OP(reduce4.s01, reduce4.s23);
174 ACC_DATA_TYPE res =
POOL_OP(reduce2.s0, reduce2.s1);
177 #if defined(POOL_AVG) || defined(POOL_L2)
// Average/L2 pooling: divide by the number of contributing elements.
179 res =
DIV_OP(res,
calculate_avg_scale(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
182 #if defined(QUANTIZED)
184 DATA_TYPE result_q8 =
CONVERT(res, DATA_TYPE);
186 #if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
// Requantize: dequantize with the input offset/scale, then quantize with the
// output scale/offset, rounding to nearest even with saturation.
// NOTE(review): the '#endif' closing this OFFSET/SCALE conditional is missing
// from this paste.
188 const float result_f32 = convert_float(result_q8);
189 const float input_offset = (float)OFFSET_IN1;
190 const float input_scale = (float)SCALE_IN1;
191 const float scale_out = (float)SCALE_OUT;
192 const float offset_out = (float)OFFSET_OUT;
193 const float in_f32 = (result_f32 - input_offset) * input_scale;
194 const float out_f32 = in_f32 / scale_out + offset_out;
195 result_q8 =
CONVERT_SAT(convert_int_rte(out_f32), DATA_TYPE);
// Store the quantized result at (id0, id1, id2) of the destination.
199 *(__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + id0 *
sizeof(DATA_TYPE) + id1 * dst_stride_y + id2 * dst_stride_z) = result_q8;
201 #else // defined(QUANTIZED)
// Store the non-quantized result (in the upstream file, L2 pooling applies
// SQRT_OP to 'res' before this store — that line is missing here).
209 *(__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + id0 *
sizeof(DATA_TYPE) + id1 * dst_stride_y + id2 * dst_stride_z) = (DATA_TYPE)res;
210 #endif // defined(QUANTIZED)
212 #endif // defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
// Body of a 2x2 pooling kernel (NCHW) that also emits the flat index of the
// winning element ("pooling with indices"). NOTE(review): the kernel
// signature, braces, and the VEC_DATA_TYPE(DATA_TYPE, 2) declarations of
// 'data0'/'data1' are missing from this paste — restore from upstream.
// Work-item coordinates: id0 = output x, id1 = output y, id2 = channel plane.
251 int id0 = get_global_id(0);
252 int id1 = get_global_id(1);
253 int id2 = get_global_id(2);
// Input coordinates of the 2x2 window: a single x (both lanes equal) and two
// consecutive rows (VEC_OFFS adds {0, 1}); both clamped into the valid range.
255 int2 x_coords =
clamp((int2)((id0 * STRIDE_X) - PAD_X), (int2)0, (int2)(SRC_WIDTH - 1));
256 int2 y_coords =
clamp((int2)((id1 * STRIDE_Y) - PAD_Y) +
VEC_OFFS(
int, 2), (int2)0, (int2)(SRC_HEIGHT - 1));
// Base byte address of the input channel plane.
258 __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + id2 * src_stride_z;
// Load two horizontally adjacent values from the top row...
262 data0 =
VLOAD(2)(0, (__global DATA_TYPE *)(src_addr + x_coords.s0 *
sizeof(DATA_TYPE) + y_coords.s0 * (int)src_stride_y));
// ...and two from the bottom row of the window.
264 data1 =
VLOAD(2)(0, (__global DATA_TYPE *)(src_addr + x_coords.s1 *
sizeof(DATA_TYPE) + y_coords.s1 * (int)src_stride_y));
// Reduce the 2x2 window: per-row winner, then overall winner.
267 DATA_TYPE data0_max =
POOL_OP(data0.s0, data0.s1);
268 DATA_TYPE data1_max =
POOL_OP(data1.s0, data1.s1);
269 DATA_TYPE res =
POOL_OP(data0_max, data1_max);
// Store the pooled value at (id0, id1, id2) of the destination.
271 *(__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + id0 *
sizeof(DATA_TYPE) + id1 * dst_stride_y + id2 * dst_stride_z) = res;
273 #if defined(SRC_BATCH)
// Flat NCHW offset of the window's top-left element, wrapped modulo
// SRC_BATCH (presumably so indices are relative to a single batch — confirm
// against the host-side consumer).
275 uint offset_top = (x_coords.s0 + y_coords.s0 * SRC_WIDTH + id2 * (SRC_WIDTH * SRC_HEIGHT)) % SRC_BATCH;
276 uint offset_bottom = offset_top + SRC_WIDTH;
// Scalar select(a, b, c) yields b when c is non-zero: pick the left element's
// offset when it is >= the right one, per row, then pick the winning row.
// Ties therefore resolve to the earlier (lower-offset) element.
278 uint index0 =
select(offset_top + 1, offset_top, isgreaterequal(data0.s0, data0.s1));
279 uint index1 =
select(offset_bottom + 1, offset_bottom, isgreaterequal(data1.s0, data1.s1));
280 uint index =
select(index1, index0, isgreaterequal(data0_max, data1_max));
// Store the argmax index alongside the pooled value.
282 *(__global uint *)(indices_ptr + indices_offset_first_element_in_bytes + id0 *
sizeof(uint) + id1 * indices_stride_y + id2 * indices_stride_z) = index;
284 #endif // defined(SRC_BATCH)