#if defined(POOL_AVG) || defined(POOL_L2)
#define POOL_OP(x, y) ((x) + (y))
#else // defined(POOL_AVG) || defined(POOL_L2)
#if defined(QUANTIZED)
#define POOL_OP(x, y) (max((x), (y)))
#else // defined(QUANTIZED)
#define POOL_OP(x, y) (fmax((x), (y)))
#endif // defined(QUANTIZED)
#endif // defined(POOL_AVG) || defined(POOL_L2)

#if defined(POOL_L2)
#define POW2_OP(x, vec_size) ((x) * (x))
#else // defined(POOL_L2)
#define POW2_OP(x, vec_size) (x)
#endif // defined(POOL_L2)

#define DIV_OP(x, y) (x * (1.f / y))
#define SQRT_OP(x) sqrt((x))

#if defined(FP_MIXED_PRECISION) || defined(QUANTIZED)
#define CONVERT_TO_ACC_DATA_TYPE(x, n) CONVERT(x, VEC_DATA_TYPE(ACC_DATA_TYPE, n))
#define VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(n, offset, ptr) CONVERT_TO_ACC_DATA_TYPE(vload##n(offset, ptr), n)
#else // defined(FP_MIXED_PRECISION) || defined(QUANTIZED)
#define VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(n, offset, ptr) vload##n(offset, ptr)
#endif // defined(FP_MIXED_PRECISION) || defined(QUANTIZED)

/** Calculates the size (in elements) of the pooling region the current work-item
 * covers, used as the divisor for average and L2 pooling. When EXCLUDE_PADDING
 * is defined, padded elements are not counted.
 */
ACC_DATA_TYPE calculate_avg_scale(const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
                                  const int pad_x, const int pad_y, const int stride_x, const int stride_y)
{
    int       start_x = get_global_id(0) * stride_x - pad_x;
    int       start_y = get_global_id(1) * stride_y - pad_y;
    const int end_x   = min(start_x + pool_size_x, upper_bound_w);
    const int end_y   = min(start_y + pool_size_y, upper_bound_h);
#if defined(EXCLUDE_PADDING)
    start_x = max(0, start_x);
    start_y = max(0, start_y);
#endif // defined(EXCLUDE_PADDING)
    return ((end_y - start_y) * (end_x - start_x));
}
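
// Illustrative example (assumed values): for a 3x3 pool with stride 2 and pad 1,
// the work-item at output (0, 0) gets start_x = start_y = -1 and
// end_x = end_y = min(-1 + 3, upper_bound) = 2. With EXCLUDE_PADDING the start
// is clamped to 0 and the scale is 2 * 2 = 4 (valid pixels only); without it,
// the scale is the full window, 3 * 3 = 9.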

#if defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
/** Performs a pooling function of pool size equal to POOL_SIZE_X x POOL_SIZE_Y for NCHW tensors. */
__kernel void pooling_layer_MxN_nchw(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst))
{
    int id0 = get_global_id(0);
    int id1 = get_global_id(1);
    int id2 = get_global_id(2);

    int x_coords = (id0 * STRIDE_X) - PAD_X;
    int y_coords = (id1 * STRIDE_Y) - PAD_Y;

    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + y_coords * (int)src_stride_y + id2 * src_stride_z;

    // Vector (8-wide) and scalar accumulators
    VEC_DATA_TYPE(ACC_DATA_TYPE, 8)
    vdata = INITIAL_VALUE;
    ACC_DATA_TYPE sdata = INITIAL_VALUE;

    // Clip the pooling window against the right/bottom borders
    const int end_x = min((int)POOL_SIZE_X, (int)(SRC_WIDTH - x_coords));
    const int end_y = min((int)POOL_SIZE_Y, (int)(SRC_HEIGHT - y_coords));
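
    // The window is traversed row by row: an 8-wide vectorized loop accumulates
    // columns into vdata, and a scalar leftover loop accumulates the remaining
    // (fewer than 8) columns into sdata. Rows above the input (y_coords + y < 0)
    // are skipped; the left border is handled per element with clamp/select.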
    for(int y = 0; y < end_y; ++y)
    {
        if((y_coords + y) >= 0)
        {
            int x = 0;
            for(; x <= (end_x - 8); x += 8)
            {
                int8 src_x = (int8)(x_coords + x) + VEC_OFFS(int, 8);
#if defined(POOL_AVG) || defined(POOL_L2)
                // For average/L2 pooling, elements left of the border must contribute zero
                SELECT_VEC_DATA_TYPE(ACC_DATA_TYPE, 8)
                cond_x = CONVERT(src_x < 0, SELECT_VEC_DATA_TYPE(ACC_DATA_TYPE, 8));
                src_x  = clamp(src_x, (int8)0, (int8)(SRC_WIDTH - 1));
                VEC_DATA_TYPE(ACC_DATA_TYPE, 8)
                data0 = select(VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)(src_addr + src_x.s0 * sizeof(DATA_TYPE) + y * src_stride_y)), (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))0, cond_x);
#else // defined(POOL_AVG) || defined(POOL_L2)
                src_x = clamp(src_x, 0, SRC_WIDTH - 1);
                VEC_DATA_TYPE(ACC_DATA_TYPE, 8)
                data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)(src_addr + src_x.s0 * sizeof(DATA_TYPE) + y * src_stride_y));
#endif // defined(POOL_AVG) || defined(POOL_L2)

#if defined(POOL_L2)
                // Raise to power of 2 for L2 pooling
                data0 = POW2_OP(data0, 8);
#endif // defined(POOL_L2)
                vdata = POOL_OP(vdata, data0);
            }

            // Leftover: process the remaining columns one by one
            for(; x < end_x; ++x)
            {
                int src_x = x_coords + x;
#if defined(POOL_AVG) || defined(POOL_L2)
                SELECT_DATA_TYPE(ACC_DATA_TYPE)
                cond_x = (src_x < 0);
                src_x  = clamp(src_x, 0, SRC_WIDTH - 1);
                ACC_DATA_TYPE data0 = select((ACC_DATA_TYPE)(*((__global DATA_TYPE *)(src_addr + src_x * sizeof(DATA_TYPE) + y * src_stride_y))), (ACC_DATA_TYPE)0, cond_x);
#else // defined(POOL_AVG) || defined(POOL_L2)
                src_x = clamp(src_x, 0, SRC_WIDTH - 1);
                ACC_DATA_TYPE data0 = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)(src_addr + src_x * sizeof(DATA_TYPE) + y * src_stride_y)));
#endif // defined(POOL_AVG) || defined(POOL_L2)

#if defined(POOL_L2)
                // Raise to power of 2 for L2 pooling
                data0 = POW2_OP(data0, 1);
#endif // defined(POOL_L2)
                sdata = POOL_OP(sdata, data0);
            }
        }
    }

    // Tree-reduce the 8-wide accumulator, then merge the scalar leftover accumulator
    VEC_DATA_TYPE(ACC_DATA_TYPE, 4)
    reduce4 = POOL_OP(vdata.s0123, vdata.s4567);
    VEC_DATA_TYPE(ACC_DATA_TYPE, 2)
    reduce2 = POOL_OP(reduce4.s01, reduce4.s23);
    ACC_DATA_TYPE res = POOL_OP(reduce2.s0, reduce2.s1);
    res = POOL_OP(res, sdata);

#if defined(POOL_AVG) || defined(POOL_L2)
    // Divide by the pool region size (padding excluded when EXCLUDE_PADDING is set)
    res = DIV_OP(res, calculate_avg_scale(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
#endif // defined(POOL_AVG) || defined(POOL_L2)

#if defined(QUANTIZED)

    DATA_TYPE result_q8 = CONVERT(res, DATA_TYPE);

#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)

    // Requantize: dequantize with the input quantization parameters, then
    // quantize with the output ones, rounding to nearest even and saturating
    const float result_f32   = convert_float(result_q8);
    const float input_offset = (float)OFFSET_IN1;
    const float input_scale  = (float)SCALE_IN1;
    const float scale_out    = (float)SCALE_OUT;
    const float offset_out   = (float)OFFSET_OUT;
    const float in_f32       = (result_f32 - input_offset) * input_scale;
    const float out_f32      = in_f32 / scale_out + offset_out;
    result_q8                = CONVERT_SAT(convert_int_rte(out_f32), DATA_TYPE);

#endif // defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
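
    // Worked example with illustrative (assumed) quantization parameters
    // OFFSET_IN1 = 10, SCALE_IN1 = 0.5f, SCALE_OUT = 0.25f, OFFSET_OUT = 5 and
    // a pooled value result_q8 = 30:
    //   in_f32    = (30 - 10) * 0.5f  = 10.0f  (dequantize with input params)
    //   out_f32   = 10.0f / 0.25f + 5 = 45.0f  (requantize with output params)
    //   result_q8 = convert_int_rte(45.0f) saturate-cast to DATA_TYPE = 45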
    // Store the requantized result
    *(__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + id0 * sizeof(DATA_TYPE) + id1 * dst_stride_y + id2 * dst_stride_z) = result_q8;

#else // defined(QUANTIZED)

#if defined(POOL_L2)
    // Take square root of the result in L2 pooling
    res = SQRT_OP(res);
#endif // defined(POOL_L2)

    // Store result
    *(__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + id0 * sizeof(DATA_TYPE) + id1 * dst_stride_y + id2 * dst_stride_z) = (DATA_TYPE)res;
#endif // defined(QUANTIZED)
}
#endif // defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)

/** Performs a MAX pooling of pool size equal to 2, and records max value indices, for NCHW. */
__kernel void pooling_layer_2_nchw_indices(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst),
    TENSOR3D_DECLARATION(indices))
{
    int id0 = get_global_id(0);
    int id1 = get_global_id(1);
    int id2 = get_global_id(2);

    // Window coordinates, clamped to the input borders
    int2 x_coords = clamp((int2)((id0 * STRIDE_X) - PAD_X), (int2)0, (int2)(SRC_WIDTH - 1));
    int2 y_coords = clamp((int2)((id1 * STRIDE_Y) - PAD_Y) + VEC_OFFS(int, 2), (int2)0, (int2)(SRC_HEIGHT - 1));

    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + id2 * src_stride_z;

    // Load the two rows of the 2x2 window (two elements each)
    VEC_DATA_TYPE(DATA_TYPE, 2)
    data0 = VLOAD(2)(0, (__global DATA_TYPE *)(src_addr + x_coords.s0 * sizeof(DATA_TYPE) + y_coords.s0 * (int)src_stride_y));
    VEC_DATA_TYPE(DATA_TYPE, 2)
    data1 = VLOAD(2)(0, (__global DATA_TYPE *)(src_addr + x_coords.s1 * sizeof(DATA_TYPE) + y_coords.s1 * (int)src_stride_y));

    // Max within each row, then across the two rows
    DATA_TYPE data0_max = POOL_OP(data0.s0, data0.s1);
    DATA_TYPE data1_max = POOL_OP(data1.s0, data1.s1);
    DATA_TYPE res       = POOL_OP(data0_max, data1_max);

    // Store result
    *(__global DATA_TYPE *)(dst_ptr + dst_offset_first_element_in_bytes + id0 * sizeof(DATA_TYPE) + id1 * dst_stride_y + id2 * dst_stride_z) = res;
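
    // When SRC_BATCH is defined, the kernel additionally writes out the
    // flattened input index of the selected maximum, as consumed e.g. by a
    // subsequent max-unpooling step.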

#if defined(SRC_BATCH)

    // Flattened offsets of the first element of the top and bottom rows of the window
    uint offset_top    = (x_coords.s0 + y_coords.s0 * SRC_WIDTH + id2 * (SRC_WIDTH * SRC_HEIGHT)) % SRC_BATCH;
    uint offset_bottom = offset_top + SRC_WIDTH;

    // Index of the max within each row, then of the overall max
    uint index0 = select(offset_top + 1, offset_top, isgreaterequal(data0.s0, data0.s1));
    uint index1 = select(offset_bottom + 1, offset_bottom, isgreaterequal(data1.s0, data1.s1));
    uint index  = select(index1, index0, isgreaterequal(data0_max, data1_max));

    *(__global uint *)(indices_ptr + indices_offset_first_element_in_bytes + id0 * sizeof(uint) + id1 * indices_stride_y + id2 * indices_stride_z) = index;

#endif // defined(SRC_BATCH)
}
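
/* Usage sketch (host side, illustrative only): these kernels are specialized at
 * compile time through the preprocessor symbols referenced above (DATA_TYPE,
 * POOL_AVG, POOL_SIZE_X, ...), which the host passes as OpenCL build options;
 * helper macros such as TENSOR3D_DECLARATION and VLOAD come from the library's
 * helpers.h. A minimal float average-pooling configuration might look like:
 *
 *   const char *build_opts =
 *       "-DDATA_TYPE=float -DACC_DATA_TYPE=float "
 *       "-DPOOL_AVG -DEXCLUDE_PADDING -DINITIAL_VALUE=0 "
 *       "-DPOOL_SIZE_X=3 -DPOOL_SIZE_Y=3 -DSTRIDE_X=2 -DSTRIDE_Y=2 "
 *       "-DPAD_X=1 -DPAD_Y=1 "
 *       "-DSRC_WIDTH=224 -DSRC_HEIGHT=224 -DMAX_WIDTH=224 -DMAX_HEIGHT=224";
 *   clBuildProgram(program, 1, &device, build_opts, NULL, NULL);
 *
 * All values above are assumptions chosen for illustration.
 */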