26 #if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(INTERNAL_DATA_TYPE) & defined(DIM_X) && defined(DIM_Y) && defined(DIM_Z)
// NOTE(review): the single '&' before defined(DIM_X) in the guard above is almost
// certainly a typo for '&&'. It happens to behave identically only because
// defined() expands to 0 or 1 (bitwise AND == logical AND on those values), but it
// should be corrected for clarity and consistency with the guard at line 143.
//
// compute_mean_var: reduction kernel. For each (channel, batch) pair it accumulates
// the sum and the sum of squares over one spatial plane of the input tensor, then
// stores per-plane statistics into the output:
//   out[ch][0][batch] = mean, out[ch][1][batch] = variance = E[x^2] - E[x]^2.
//
// NOTE(review): this extract is missing lines (the kernel parameter list, the
// opening '#if defined(NHWC)' directive, loop braces, the 'part_sum += data;'
// accumulations, and the opening '#if(VEC_SIZE > n)' directives whose matching
// '#endif's appear below) — comments are hedged accordingly.
52 __kernel
void compute_mean_var(
// ---- NHWC path (presumably under a '#if defined(NHWC)' stripped from this
// extract; the matching '#else' is visible below). One work-item handles one
// (channel, batch) plane of DIM_Y * DIM_Z scalar elements. ----
60 const int ch = get_global_id(0);
61 const int batch = get_global_id(1);
62 const int elements_plane = DIM_Y * DIM_Z;
// Scalar accumulators in the (presumably wider, e.g. float) internal type to
// limit precision loss when DATA_TYPE is a narrow type — TODO confirm.
63 INTERNAL_DATA_TYPE part_sum = 0.f;
64 INTERNAL_DATA_TYPE part_sum_sq = 0.f;
// NOTE(review): in_offset is computed but not referenced in the visible lines
// (the loop addresses via tensor4D_offset) — possibly dead, or used in a
// dropped line; verify against the full file.
65 const int in_offset = input_offset_first_element_in_bytes + batch * input_stride_w + ch *
sizeof(DATA_TYPE);
// Walk the plane element by element, accumulating sum and sum of squares.
67 for(
int i_w = 0; i_w < DIM_Y; ++i_w)
69 for(
int i_h = 0; i_h < DIM_Z; ++i_h)
71 INTERNAL_DATA_TYPE data = (INTERNAL_DATA_TYPE) * ((__global DATA_TYPE *)
tensor4D_offset(&in, ch, i_w, i_h, batch));
// NOTE(review): the matching 'part_sum += data;' line is missing from this extract.
73 part_sum_sq += data * data;
// var = E[x^2] - E[x]^2 (biased variance; divide by element count, not N-1).
77 INTERNAL_DATA_TYPE mean = (part_sum / elements_plane);
78 INTERNAL_DATA_TYPE var = (part_sum_sq / elements_plane) - (mean * mean);
// Statistics layout in 'out': y == 0 holds the mean, y == 1 holds the variance.
79 __global INTERNAL_DATA_TYPE *output_address0 = (__global INTERNAL_DATA_TYPE *)
tensor3D_offset(&out, ch, 0, batch);
80 *output_address0 = mean;
81 __global INTERNAL_DATA_TYPE *output_address1 = (__global INTERNAL_DATA_TYPE *)
tensor3D_offset(&out, ch, 1, batch);
82 *output_address1 = var;
// ---- NCHW path: channel/batch are folded into global id 2; the plane spans
// DIM_X * DIM_Y. Here part_sum / part_sum_sq are VEC_SIZE-wide vectors (see the
// .sXX component reductions below); their vector declarations are missing from
// this extract. ----
83 #else // !defined(NHWC)
84 const int ch = get_global_id(2) % DIM_Z;
85 const int batch = get_global_id(2) / DIM_Z;
86 const int elements_plane = DIM_X * DIM_Y;
// Vectorized inner loop over x (VLOAD body missing from extract), per row y.
93 for(
int y = 0; y < DIM_Y; ++y)
102 part_sum_sq += data * data;
// Scalar tail loop for the trailing elements when DIM_X is not a multiple of
// VEC_SIZE; accumulates into lane 0 only.
105 for(; x < DIM_X; ++x)
107 INTERNAL_DATA_TYPE data = (INTERNAL_DATA_TYPE)(*((__global DATA_TYPE *)
tensor4D_offset(&in, x, y, ch, batch)));
// NOTE(review): the matching 'part_sum.s0 += data;' line is missing from this extract.
109 part_sum_sq.s0 += data * data;
// Tree reduction of the vector accumulators: fold the upper half into the lower
// half at each step (16 -> 8 -> 4 -> 2 -> 1 lanes). The opening
// '#if(VEC_SIZE > n)' directives for these '#endif's are missing from the extract.
114 part_sum.s01234567 += part_sum.s89abcdef;
115 part_sum_sq.s01234567 += part_sum_sq.s89abcdef;
116 #endif // VEC_SIZE > 8
118 part_sum.s0123 += part_sum.s4567;
119 part_sum_sq.s0123 += part_sum_sq.s4567;
120 #endif // VEC_SIZE > 4
122 part_sum.s01 += part_sum.s23;
123 part_sum_sq.s01 += part_sum_sq.s23;
124 #endif // VEC_SIZE > 2
125 part_sum.s0 += part_sum.s1;
126 part_sum_sq.s0 += part_sum_sq.s1;
// Lane 0 now holds the full plane totals.
128 INTERNAL_DATA_TYPE sum = (INTERNAL_DATA_TYPE)part_sum.s0;
129 INTERNAL_DATA_TYPE sum_sq = (INTERNAL_DATA_TYPE)part_sum_sq.s0;
// Same biased-variance formula and output layout as the NHWC path above.
131 const INTERNAL_DATA_TYPE mean = (sum / elements_plane);
132 const INTERNAL_DATA_TYPE var = (sum_sq / elements_plane) - (mean * mean);
134 __global INTERNAL_DATA_TYPE *output_address0 = (__global INTERNAL_DATA_TYPE *)
tensor3D_offset(&out, ch, 0, batch);
135 *output_address0 = mean;
136 __global INTERNAL_DATA_TYPE *output_address1 = (__global INTERNAL_DATA_TYPE *)
tensor3D_offset(&out, ch, 1, batch);
137 *output_address1 = var;
139 #endif // defined(NHWC)
143 #if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(INTERNAL_DATA_TYPE) && defined(GAMMA) && defined(BETA) && defined(EPSILON) && defined(DIM_X) && defined(DIM_Y) && defined(DIM_Z)
// instance_normalization (apply step): normalizes every element of a
// (channel, batch) plane using the statistics produced by compute_mean_var:
//   out = (in - mean) * GAMMA / sqrt(var + EPSILON) + BETA
//
// NOTE(review): this extract is missing the kernel signature, the
// '#if defined(NHWC)' and (presumably) '#if defined(IN_PLACE)' directives whose
// branches are both visible below, plus loop braces and the vectorized-store
// body — comments are hedged accordingly.
//
// ---- work-item -> (channel, batch) mapping: the two pairs below are alternate
// branches of a stripped layout '#if' (NHWC first, NCHW second). ----
186 const int ch = get_global_id(0);
187 const int batch = get_global_id(2);
189 const int ch = get_global_id(2) % DIM_Z;
190 const int batch = get_global_id(2) / DIM_Z;
// Load the per-plane statistics: mean at y == 0, variance at y == 1 (matches the
// layout written by compute_mean_var).
193 const __global INTERNAL_DATA_TYPE *mean_ptr = (__global INTERNAL_DATA_TYPE *)
tensor3D_offset(&mean_var, ch, 0, batch);
194 const __global INTERNAL_DATA_TYPE *var_ptr = (__global INTERNAL_DATA_TYPE *)
tensor3D_offset(&mean_var, ch, 1, batch);
195 const INTERNAL_DATA_TYPE mean = (INTERNAL_DATA_TYPE) * mean_ptr;
196 const INTERNAL_DATA_TYPE var = (INTERNAL_DATA_TYPE) * var_ptr;
// Fold GAMMA and the variance into a single multiplier: GAMMA / sqrt(var + eps).
197 const INTERNAL_DATA_TYPE multip = GAMMA / sqrt(var + EPSILON);
// NOTE(review): 'beta' is never used in the visible store statements — they add
// (INTERNAL_DATA_TYPE)BETA directly. Either use 'beta' there or drop this local.
198 const INTERNAL_DATA_TYPE beta = (INTERNAL_DATA_TYPE)BETA;
// NOTE(review): in_offset/out_offset are computed but not referenced in the
// visible lines (addressing goes through tensor4D_offset) — possibly dead or
// used in dropped lines. Also, out_offset scales 'batch' by input_stride_w, not
// an output stride: harmless only if input/output strides match (e.g. in-place);
// confirm against the full file.
201 const int in_offset = input_offset_first_element_in_bytes + batch * input_stride_w + ch *
sizeof(DATA_TYPE);
203 const int out_offset = output_offset_first_element_in_bytes + batch * input_stride_w + ch *
sizeof(DATA_TYPE);
// ---- NHWC path: scalar loop over the DIM_Y x DIM_Z plane. ----
206 for(
int i_w = 0; i_w < DIM_Y; ++i_w)
208 for(
int i_h = 0; i_h < DIM_Z; ++i_h)
210 __global DATA_TYPE *input_address = (__global DATA_TYPE *)
tensor4D_offset(&in, ch, i_w, i_h, batch);
// The two output_address declarations below are alternate branches of a
// stripped '#if defined(IN_PLACE)': write over the input, or address 'out'.
212 __global DATA_TYPE *output_address = input_address;
214 __global DATA_TYPE *output_address = (__global DATA_TYPE *)
tensor4D_offset(&out, ch, i_w, i_h, batch);
216 *(output_address) = (*(input_address) - mean) * multip + (INTERNAL_DATA_TYPE)BETA;
// ---- NCHW path: vectorized main loop over x per row y (VLOAD/VSTORE body
// partially missing from this extract), then a scalar tail loop. ----
219 #else // !defined(NHWC)
220 for(
int y = 0; y < DIM_Y; ++y)
225 __global DATA_TYPE *input_address = (__global DATA_TYPE *)
tensor4D_offset(&in, x, y, ch, batch);
// In-place vs out-of-place alternatives again (stripped '#if defined(IN_PLACE)').
227 __global DATA_TYPE *output_address = input_address;
229 __global DATA_TYPE *output_address = (__global DATA_TYPE *)
tensor4D_offset(&out, x, y, ch, batch);
// Vector-wide normalization of VEC_SIZE elements ('data'/'res' declarations and
// the VSTORE are missing from this extract).
236 res = (data - mean) * multip + (INTERNAL_DATA_TYPE)BETA;
// Scalar tail for trailing elements when DIM_X is not a multiple of VEC_SIZE.
241 for(; x < DIM_X; ++x)
243 __global DATA_TYPE *input_address = (__global DATA_TYPE *)
tensor4D_offset(&in, x, y, ch, batch);
245 __global DATA_TYPE *output_address = input_address;
247 __global DATA_TYPE *output_address = (__global DATA_TYPE *)
tensor4D_offset(&out, x, y, ch, batch);
249 *(output_address) = (*(input_address) - mean) * multip + (INTERNAL_DATA_TYPE)BETA;
252 #endif // defined(NHWC)